arm_compute v18.05
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 54e3e52..cda29d6 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,47 +27,8 @@
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Types.h"
-#include <map>
-#include <regex>
#include <vector>
-namespace
-{
-arm_compute::GPUTarget get_bifrost_target(const std::string &version)
-{
- if(version == "70")
- {
- return arm_compute::GPUTarget::G70;
- }
- else
- {
- return arm_compute::GPUTarget::BIFROST;
- }
-}
-
-arm_compute::GPUTarget get_midgard_target(const std::string &version)
-{
- switch(version[0])
- {
- case '6':
- return arm_compute::GPUTarget::T600;
- case '7':
- return arm_compute::GPUTarget::T700;
- case '8':
- return arm_compute::GPUTarget::T800;
- default:
- return arm_compute::GPUTarget::MIDGARD;
- }
-}
-
-bool extension_support(const cl::Device &device, const char *extension_name)
-{
- std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
- auto pos = extensions.find(extension_name);
- return (pos != std::string::npos);
-}
-} // namespace
-
namespace arm_compute
{
std::string get_cl_type_from_data_type(const DataType &dt)
@@ -150,94 +111,27 @@
}
}
-const std::string &string_from_target(GPUTarget target)
-{
- static std::map<GPUTarget, const std::string> gpu_target_map =
- {
- { GPUTarget::MIDGARD, "midgard" },
- { GPUTarget::BIFROST, "bifrost" },
- { GPUTarget::T600, "t600" },
- { GPUTarget::T700, "t700" },
- { GPUTarget::T800, "t800" },
- { GPUTarget::G70, "g70" }
- };
-
- return gpu_target_map[target];
-}
-
GPUTarget get_target_from_device(cl::Device &device)
{
- size_t name_size = 0;
-
// Query device name size
- cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
- ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
- ARM_COMPUTE_UNUSED(err);
+ std::string device_name = device.getInfo<CL_DEVICE_NAME>();
- std::vector<char> name_buffer(name_size);
-
- // Query device name
- err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name_buffer.data(), nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
- ARM_COMPUTE_UNUSED(err);
-
- std::regex mali_regex(R"(Mali-([TG])(\d+))");
- std::string device_name(name_buffer.begin(), name_buffer.end());
- std::smatch name_parts;
- const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
-
- if(!found_mali)
- {
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Mali GPU. Target is set to MIDGARD.");
- return GPUTarget::MIDGARD;
- }
-
- const char target = name_parts.str(1)[0];
- const std::string &version = name_parts.str(2);
-
- switch(target)
- {
- case 'T':
- return get_midgard_target(version);
- case 'G':
- return get_bifrost_target(version);
- default:
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Mali GPU unknown. Target is set to the default one.");
- return GPUTarget::MIDGARD;
- }
+ return get_target_from_name(device_name);
}
-GPUTarget get_arch_from_target(GPUTarget target)
+bool arm_non_uniform_workgroup_supported(const cl::Device &device)
{
- return (target & GPUTarget::GPU_ARCH_MASK);
+ return device_supports_extension(device, "cl_arm_non_uniform_work_group_size");
}
-bool non_uniform_workgroup_support(const cl::Device &device)
+bool fp16_supported(const cl::Device &device)
{
- return extension_support(device, "cl_arm_non_uniform_work_group_size");
-}
-
-bool fp16_support(const cl::Device &device)
-{
- return extension_support(device, "cl_khr_fp16");
+ return device_supports_extension(device, "cl_khr_fp16");
}
CLVersion get_cl_version(const cl::Device &device)
{
- std::vector<char> version;
- size_t version_size = 0;
- cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, 0, nullptr, &version_size);
- ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (version_size == 0), "clGetDeviceInfo failed to return valid information");
- ARM_COMPUTE_UNUSED(err);
-
- // Resize vector
- version.resize(version_size);
- // Query version
- err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, version_size, version.data(), nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
- ARM_COMPUTE_UNUSED(err);
-
- std::string version_str(version.begin(), version.end());
+ std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
if(version_str.find("OpenCL 2") != std::string::npos)
{
return CLVersion::CL20;
@@ -258,4 +152,11 @@
return CLVersion::UNKNOWN;
}
+bool device_supports_extension(const cl::Device &device, const char *extension_name)
+{
+ std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
+ auto pos = extensions.find(extension_name);
+ return (pos != std::string::npos);
+}
+
} // namespace arm_compute
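
Note: the extension check that previously lived in an anonymous namespace is now exposed as device_supports_extension, so callers outside this translation unit can query arbitrary extensions. A minimal usage sketch (an assumed call site, not part of the patch; cl_arm_printf is just an example extension string):

    cl::Device device = cl::Device::getDefault();
    // true only if the driver reports the extension in CL_DEVICE_EXTENSIONS
    const bool has_printf = arm_compute::device_supports_extension(device, "cl_arm_printf");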
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index c7c08d4..bdb26f8 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -151,7 +151,8 @@
{ "activation_layer_qa8", "activation_layer_qa8.cl" },
{ "arithmetic_add", "arithmetic_op.cl" },
{ "arithmetic_sub", "arithmetic_op.cl" },
- { "batchnormalization_layer", "batchnormalization_layer.cl" },
+ { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
+ { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
{ "bitwise_or", "bitwise_op.cl" },
{ "bitwise_and", "bitwise_op.cl" },
{ "bitwise_xor", "bitwise_op.cl" },
@@ -161,6 +162,7 @@
{ "channel_combine_RGBA8888", "channel_combine.cl" },
{ "channel_combine_UYVY422", "channel_combine.cl" },
{ "channel_combine_YUYV422", "channel_combine.cl" },
+ { "channel_shuffle_nchw", "channel_shuffle.cl" },
{ "channel_extract_NV12", "channel_extract.cl" },
{ "channel_extract_NV21", "channel_extract.cl" },
{ "channel_extract_RGB888", "channel_extract.cl" },
@@ -170,8 +172,12 @@
{ "combine_gradients_L1", "canny.cl" },
{ "combine_gradients_L2", "canny.cl" },
{ "concatenate_depth", "concatenate.cl" },
+ { "concatenate_width", "concatenate.cl" },
{ "convolution_rectangle", "convolution_rectangle.cl" },
{ "col2im", "col2im.cl" },
+ { "convert_depth_down", "depth_convert.cl" },
+ { "convert_depth_up", "depth_convert.cl" },
+ { "convert_fc_weights", "convert_fc_weights.cl" },
{ "convolution3x3_static", "convolution3x3.cl" },
{ "convolution5x5_static", "convolution5x5.cl" },
{ "convolution7x7_static", "convolution7x7.cl" },
@@ -182,17 +188,20 @@
{ "convolution_separable7x1_static", "convolution7x7.cl" },
{ "convolution_separable1x9_static", "convolution9x9.cl" },
{ "convolution_separable9x1_static", "convolution9x9.cl" },
- { "convert_depth_down", "depth_convert.cl" },
- { "convert_depth_up", "depth_convert.cl" },
+ { "copy_tensor", "copy_tensor.cl" },
{ "copy_plane", "channel_extract.cl" },
{ "copy_planes_3p", "channel_combine.cl" },
{ "copy_to_keypoint", "fast_corners.cl" },
{ "deconvolution_upsample", "deconvolution_layer.cl" },
{ "depthwise_convolution_3x3", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_stridex1_stridey1_bifrost", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_stridex2_stridey2_bifrost", "depthwise_convolution.cl" },
+ { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" },
+ { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" },
+ { "depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl" },
+ { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" },
+ { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
+ { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
+ { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl" },
{ "depthwise_im2col", "depthwise_convolution.cl" },
{ "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
{ "depthwise_weights_reshape", "depthwise_convolution.cl" },
@@ -223,11 +232,13 @@
{ "gemm_mv", "gemv.cl" },
{ "gemm_mv_quantized", "gemv.cl" },
{ "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
- { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_qs8", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_qs16", "gemm.cl" },
{ "gemm_mm_floating_point", "gemm.cl" },
+ { "gemm_mm_floating_point_f16_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
{ "gemm_mm_qs8", "gemm.cl" },
@@ -306,8 +317,10 @@
{ "pooling_layer_3", "pooling_layer.cl" },
{ "pooling_layer_optimized_3", "pooling_layer.cl" },
{ "pooling_layer_7", "pooling_layer.cl" },
- { "pooling_layer_MxN", "pooling_layer.cl" },
- { "pooling_layer_MxN_quantized", "pooling_layer_quantized.cl" },
+ { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
+ { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
+ { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
+ { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
{ "quantization_layer", "quantization_layer.cl" },
{ "reduction_operation", "reduction_operation.cl" },
{ "remap_nearest_neighbour", "remap.cl" },
@@ -351,6 +364,16 @@
{ "warp_affine_bilinear", "warp_affine.cl" },
{ "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
{ "warp_perspective_bilinear", "warp_perspective.cl" },
+ { "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" },
+ { "winograd_filter_transform_4x4_3x3_nchw", "winograd.cl" },
+ { "winograd_filter_transform_4x4_5x5_nchw", "winograd.cl" },
+ { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl" },
+ { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" },
+ { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" },
+ { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl" },
+ { "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" },
+ { "winograd_output_transform_4x4_3x3_nchw", "winograd.cl" },
+ { "winograd_output_transform_4x4_5x5_nchw", "winograd.cl" },
{ "YUYV422_to_IYUV_bt709", "color_convert.cl" },
{ "YUYV422_to_NV12_bt709", "color_convert.cl" },
{ "YUYV422_to_RGB888_bt709", "color_convert.cl" },
@@ -397,6 +420,10 @@
#include "./cl_kernels/channel_extract.clembed"
},
{
+ "channel_shuffle.cl",
+#include "./cl_kernels/channel_shuffle.clembed"
+ },
+ {
"col2im.cl",
#include "./cl_kernels/col2im.clembed"
},
@@ -409,6 +436,10 @@
#include "./cl_kernels/color_convert.clembed"
},
{
+ "convert_fc_weights.cl",
+#include "./cl_kernels/convert_fc_weights.clembed"
+ },
+ {
"convolution3x3.cl",
#include "./cl_kernels/convolution3x3.clembed"
},
@@ -433,6 +464,10 @@
#include "./cl_kernels/convolution_rectangle.clembed"
},
{
+ "copy_tensor.cl",
+#include "./cl_kernels/copy_tensor.clembed"
+ },
+ {
"deconvolution_layer.cl",
#include "./cl_kernels/deconvolution_layer.clembed"
},
@@ -676,12 +711,17 @@
"warp_perspective.cl",
#include "./cl_kernels/warp_perspective.clembed"
},
+ {
+ "winograd.cl",
+#include "./cl_kernels/winograd.clembed"
+ },
#endif /* EMBEDDED_KERNELS */
};
CLKernelLibrary::CLKernelLibrary()
: _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
{
+ opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built
}
CLKernelLibrary &CLKernelLibrary::get()
@@ -699,22 +739,21 @@
{
ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
}
-
std::string concat_str;
- if(fp16_support(_device))
+ if(fp16_supported(_device))
{
concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
}
- if(non_uniform_workgroup_support(_device))
- {
- concat_str += " -cl-arm-non-uniform-work-group-size ";
- }
- else if(get_cl_version(_device) == CLVersion::CL20)
+ if(get_cl_version(_device) == CLVersion::CL20)
{
concat_str += " -cl-std=CL2.0 ";
}
+ else if(arm_non_uniform_workgroup_supported(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
else
{
ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
@@ -750,6 +789,11 @@
return Kernel(kernel_name, cl_program);
}
+void CLKernelLibrary::add_built_program(const std::string &built_program_name, cl::Program program)
+{
+ _built_programs_map.emplace(built_program_name, program);
+}
+
const Program &CLKernelLibrary::load_program(const std::string &program_name) const
{
const auto program_it = _programs_map.find(program_name);
@@ -838,5 +882,26 @@
cl::NDRange CLKernelLibrary::default_ndrange() const
{
- return cl::NDRange(128u, 1);
+ cl::Device device = cl::Device::getDefault();
+ GPUTarget target = get_target_from_device(device);
+ cl::NDRange default_range;
+
+ switch(target)
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ default_range = cl::NDRange(128u, 1);
+ break;
+ default:
+ default_range = cl::NullRange;
+ }
+
+ return default_range;
+}
+
+std::string CLKernelLibrary::get_device_version()
+{
+ return _device.getInfo<CL_DEVICE_VERSION>();
}
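
Note: default_ndrange() no longer hard-codes a 128x1 range for every device; Midgard-class targets keep it, while all other targets now return cl::NullRange so the driver chooses the work-group size. A sketch of the effect at an assumed call site (queue, kernel and gws are placeholders, not names from the patch):

    cl::NDRange lws = arm_compute::CLKernelLibrary::get().default_ndrange();
    // lws == cl::NDRange(128u, 1) on MIDGARD/T600/T700/T800, cl::NullRange otherwise
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, gws, lws);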
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index f75a90a..a8ed973 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -111,6 +111,12 @@
LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle);
LOAD_FUNCTION_PTR(clGetKernelInfo, handle);
LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle);
+ LOAD_FUNCTION_PTR(clSVMAlloc, handle);
+ LOAD_FUNCTION_PTR(clSVMFree, handle);
+ LOAD_FUNCTION_PTR(clEnqueueSVMMap, handle);
+ LOAD_FUNCTION_PTR(clEnqueueSVMUnmap, handle);
+ LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
+ LOAD_FUNCTION_PTR(clWaitForEvents, handle);
#undef LOAD_FUNCTION_PTR
@@ -129,6 +135,90 @@
}
} // namespace arm_compute
+cl_int clEnqueueMarker(cl_command_queue command_queue,
+ cl_event *event)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr;
+ if(func != nullptr)
+ {
+ return func(command_queue, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clWaitForEvents(cl_uint num_events,
+ const cl_event *event_list)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr;
+ if(func != nullptr)
+ {
+ return func(num_events, event_list);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr,
+ size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr;
+ if(func != nullptr)
+ {
+ return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr;
+ if(func != nullptr)
+ {
+ return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl_uint alignment)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr;
+ if(func != nullptr)
+ {
+ return func(context, flags, size, alignment);
+ }
+ else
+ {
+ return nullptr;
+ }
+}
+
+void clSVMFree(cl_context context, void *svm_pointer)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clSVMFree_ptr;
+ if(func != nullptr)
+ {
+ func(context, svm_pointer);
+ }
+}
+
cl_int clGetContextInfo(cl_context context,
cl_context_info param_name,
size_t param_value_size,
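
Note: all six new entry points follow the same lazy-dispatch pattern as the existing wrappers: load the symbol table on first use, forward the call if the symbol was resolved, and otherwise fail gracefully (CL_OUT_OF_RESOURCES for cl_int-returning functions, nullptr for clSVMAlloc). The pattern, sketched for a hypothetical entry point clFoo (illustrative only):

    cl_int clFoo(cl_command_queue command_queue)
    {
        arm_compute::CLSymbols::get().load_default();        // dlopen/dlsym on first call
        auto func = arm_compute::CLSymbols::get().clFoo_ptr; // nullptr if the symbol is absent
        return (func != nullptr) ? func(command_queue) : CL_OUT_OF_RESOURCES;
    }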
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 4424a66..a8ea738 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -115,6 +115,8 @@
#define ACTIVATION_OP2(op, x) op##_op(x)
#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#if defined(ACT)
+
/** This performs an activation function on floating point inputs.
*
* @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -168,3 +170,5 @@
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)output.ptr);
}
+
+#endif /* defined(ACT) */
\ No newline at end of file
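
Note: the new #if defined(ACT) guard lets batchnormalization_layer.cl (further down in this patch) #include this file purely for its ACTIVATION_OP macros without redefining the activation_layer kernel; the kernel body is only compiled when an activation is selected at build time, e.g. with a hypothetical option set such as:

    -DACT=relu -DDATA_TYPE=float -DVEC_SIZE=16 -DA_VAL=0 -DB_VAL=0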
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index cb31e99..66e54ed 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -44,6 +44,26 @@
#define ACTIVATION_OP2(op, x) op##_op(x)
#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+#define PERFORM_ACTIVATION_QA8(act, data) \
+ ({ \
+ data = ACTIVATION_OP(act, data); \
+ \
+ VEC_DATA_TYPE(float, VEC_SIZE) \
+ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
+ \
+ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE)); \
+ })
+#else /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QA8(act, data) \
+ ({ \
+ data = ACTIVATION_OP(act, data); \
+ })
+#endif /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+
+#if defined(ACT)
+
/** This performs an activation function on QASYMM8 inputs.
*
* @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -92,19 +112,11 @@
// Load data
TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
- // Perform activation
- data = ACTIVATION_OP(ACT, data);
-
-#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
- // requantize to output space
- VEC_DATA_TYPE(float, VEC_SIZE)
- fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
-
- fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
- data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));
-#endif // defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+ data = PERFORM_ACTIVATION_QA8(ACT, data);
// Store result
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)output.ptr);
}
+
+#endif /* defined(ACT) */
\ No newline at end of file
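
Note: PERFORM_ACTIVATION_QA8 folds the optional requantization into one macro. When the O*/S* values are defined, it maps a value from the input quantization (scale S1_VAL, offset O1_VAL) to the output quantization (scale S2_VAL, offset O2_VAL):

    q_out = round((q_in - O1_VAL) * (S1_VAL / S2_VAL) + O2_VAL)

Worked check with hypothetical values S1_VAL=0.5, O1_VAL=10, S2_VAL=0.25, O2_VAL=0 and q_in=20: the real value is 0.5 * (20 - 10) = 5.0, and the macro yields (20 - 10) * (0.5 / 0.25) + 0 = 20, i.e. 5.0 / 0.25 as expected.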
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5ddeb1a..9c980da 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,15 +44,12 @@
#endif /* FIXED_POINT_POSITION */
-#if defined(LU_BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL)
-#elif defined(BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)0, (DATA_TYPE)A_VAL)
-#elif defined(RELU)
-#define ACTIVATION_FUNC(x) max(x, (DATA_TYPE)0)
-#else /* FUSED_ACT */
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
#define ACTIVATION_FUNC(x) (x)
-#endif /* FUSED_ACT */
+#endif /* defined(FUSED_ACTIVATION) */
/** Apply batch normalization.
*
@@ -90,15 +87,19 @@
* @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
* @param[in] epsilon Epsilon parameter in the batch normalization equation
*/
-__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
+ TENSOR3D_DECLARATION(output),
#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
- VECTOR_DECLARATION(beta),
- VECTOR_DECLARATION(gamma),
- float epsilon)
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
{
Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
#ifdef IN_PLACE
@@ -106,10 +107,14 @@
#else /* IN_PLACE */
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
#endif /* IN_PLACE */
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
data = 0;
@@ -120,9 +125,7 @@
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
x_bar = 0;
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = 0;
+ res = 0;
const int current_slice = get_global_id(2);
@@ -135,11 +138,22 @@
numerator = SUB_OP(data, numerator);
x_bar = MUL_OP(numerator, denominator);
- gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
- beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
-
+#ifndef USE_DEFAULT_GAMMA
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = ADD_OP(MUL_OP(gamma_vec, x_bar), beta_vec);
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+ res = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+ // beta is not zero, hence we need to perform the addition
+ res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
res = ACTIVATION_FUNC(res);
@@ -147,4 +161,113 @@
(res, 0, (__global DATA_TYPE *)out.ptr);
}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
\ No newline at end of file
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in;
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = 0;
+
+ const int current_slice = get_global_id(0);
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+
+ res = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+ // beta is not zero, hence we need to perform the addition
+ res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res = ACTIVATION_FUNC(res);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
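
Note: the NCHW and NHWC kernels compute the same normalization; only the slice index differs (get_global_id(2) for NCHW, get_global_id(0) for NHWC):

    x_bar = (x - mean) / sqrt(var + epsilon)
    out   = ACTIVATION_FUNC(gamma * x_bar + beta)

Under USE_DEFAULT_GAMMA the multiply is skipped (gamma == 1), and under USE_DEFAULT_BETA the add is skipped (beta == 0), which is exactly what the new #ifndef blocks implement.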
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
index d309812..4207414 100644
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -338,9 +338,9 @@
uchar8 data2 = vload8(0, src_plane2.ptr);
#ifdef NV12
- vstore16(shuffle2(data1, data2, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
+ vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
#elif defined(NV21)
- vstore16(shuffle2(data2, data1, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
+ vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
#endif /* NV12 or NV21 */
}
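
Note on the mask fix above: in shuffle2(a, b, mask), mask indices 0-7 select from a and 8-15 from b. The old mask (0, 2, 4, ..., 1, 3, ...) wrote the two chroma planes back to back; the new mask (0, 8, 1, 9, ...) alternates between the planes, producing the interleaved layout that NV12/NV21 require. For the NV12 case:

    data1 = U0 U1 ... U7, data2 = V0 V1 ... V7
    new mask -> U0 V0 U1 V1 ... U7 V7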
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
new file mode 100644
index 0000000..26cee9c
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K)
+
+// Check valid BLOCK_SIZES
+#if BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16
+#error "Only block sizes 4, 8 and 16 are supported"
+#endif /* BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16 */
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+
+/** Performs channel shuffle. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+
+ const uint curr_channel = get_global_id(2); // channel id of input
+ const uint group_id = curr_channel / K; // group id
+ const uint channel_id = curr_channel % K; // channel id within the group
+
+ const uint x = get_global_id(0) * BLOCK_SIZE;
+ const uint y = get_global_id(1) * BLOCK_SIZE;
+ const uint z = channel_id * NUM_GROUPS + group_id;
+
+ // Load the BLOCK_SIZE x BLOCK_SIZE block
+ TYPE u0 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
+ TYPE u1 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
+ TYPE u2 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));
+ TYPE u3 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 3, 0));
+#if BLOCK_SIZE > 4
+ TYPE u4 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 4, 0));
+ TYPE u5 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 5, 0));
+ TYPE u6 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 6, 0));
+ TYPE u7 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 7, 0));
+#if BLOCK_SIZE == 16
+ TYPE u8 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 8, 0));
+ TYPE u9 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 9, 0));
+ TYPE u10 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 10, 0));
+ TYPE u11 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 11, 0));
+ TYPE u12 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 12, 0));
+ TYPE u13 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 13, 0));
+ TYPE u14 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 14, 0));
+ TYPE u15 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 15, 0));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+
+ // Store blocks
+ VSTORE(BLOCK_SIZE)
+ (u0, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 0, z));
+ VSTORE(BLOCK_SIZE)
+ (u1, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 1, z));
+ VSTORE(BLOCK_SIZE)
+ (u2, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 2, z));
+ VSTORE(BLOCK_SIZE)
+ (u3, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 3, z));
+#if BLOCK_SIZE > 4
+ VSTORE(BLOCK_SIZE)
+ (u4, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 4, z));
+ VSTORE(BLOCK_SIZE)
+ (u5, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 5, z));
+ VSTORE(BLOCK_SIZE)
+ (u6, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 6, z));
+ VSTORE(BLOCK_SIZE)
+ (u7, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 7, z));
+#if BLOCK_SIZE == 16
+ VSTORE(BLOCK_SIZE)
+ (u8, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 8, z));
+ VSTORE(BLOCK_SIZE)
+ (u9, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 9, z));
+ VSTORE(BLOCK_SIZE)
+ (u10, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 10, z));
+ VSTORE(BLOCK_SIZE)
+ (u11, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 11, z));
+ VSTORE(BLOCK_SIZE)
+ (u12, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 12, z));
+ VSTORE(BLOCK_SIZE)
+ (u13, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 13, z));
+ VSTORE(BLOCK_SIZE)
+ (u14, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 14, z));
+ VSTORE(BLOCK_SIZE)
+ (u15, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 15, z));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+}
+#endif /* defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K) */
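
Note: the kernel maps input channel c (decomposed as group_id = c / K, channel_id = c % K) to output channel channel_id * NUM_GROUPS + group_id. A hypothetical host-side sketch of matching build options (not the library's kernel-configuration code), assuming a float tensor with 8 channels split into 2 groups:

    #include <set>
    #include <string>

    const unsigned int num_channels = 8;
    const unsigned int num_groups   = 2;
    const unsigned int K            = num_channels / num_groups; // 4 channels per group

    std::set<std::string> build_opts;
    build_opts.emplace("-DDATA_TYPE=float");
    build_opts.emplace("-DBLOCK_SIZE=4");
    build_opts.emplace("-DNUM_GROUPS=" + std::to_string(num_groups));
    build_opts.emplace("-DK=" + std::to_string(K));
    // e.g. input channel 5 -> group 1, channel 1 -> output channel 1 * 2 + 1 = 3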
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index a92ab5b..f97ae13 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,41 @@
*/
#include "helpers.h"
+/** This kernel concatenates the input tensor into the output tensor along the first dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QASYMM8, QS16, F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_width(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ int offset)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+ VSTORE(VEC_SIZE)
+ (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offset));
+}
+
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QS16, F16, F32
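
Note: for concatenate_width the host is expected to enqueue the kernel once per input, passing the byte offset of that input's first column in the destination. A hypothetical example for two F32 tensors of widths 64 and 32 concatenated into a width-96 output:

    offset = 0                   // first input starts at column 0
    offset = 64 * sizeof(float)  // = 256 bytes, second input starts at column 64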
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl
new file mode 100644
index 0000000..3c3e8b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/convert_fc_weights.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
+/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights.
+ *
+ * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor.
+ * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Original input tensor width*height and depth should be given as preprocessor arguments using -DFACTOR_1=size and -DFACTOR_2=size for NCHW, and swapped for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QS8, QASYMM8, U16, S16, QS16, U32, S32, QS32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convert_fc_weights(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y;
+
+ *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
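
Note: the kernel moves source row y to destination row (y % FACTOR_1) * FACTOR_2 + y / FACTOR_1. Worked example for NCHW -> NHWC with FACTOR_1 = W*H = 4 and FACTOR_2 = C = 3: source row y = 5 encodes channel c = 5 / 4 = 1 and spatial index s = 5 % 4 = 1, so it lands on destination row s * C + c = 1 * 3 + 1 = 4, which is the NHWC flattening of the same weight.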
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
new file mode 100644
index 0000000..4b37dec
--- /dev/null
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Performs a copy of the input tensor to the output tensor.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void copy_tensor(
+ VECTOR_DECLARATION(in),
+ VECTOR_DECLARATION(out))
+{
+ Vector in = CONVERT_TO_VECTOR_STRUCT(in);
+ Vector out = CONVERT_TO_VECTOR_STRUCT(out);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)in.ptr);
+
+ vstore16(data, 0, (__global DATA_TYPE *)out.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index f352138..5f4247e 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -24,6 +24,7 @@
#include "helpers.h"
+#if defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
@@ -192,6 +193,8 @@
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)
+ src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
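
Note: the src.ptr adjustment added above implements the new DEPTH_MULTIPLIER support: output channel z reads input channel z / DEPTH_MULTIPLIER, but CONVERT_TO_TENSOR3D_STRUCT has already advanced the pointer by z * src_step_z, so the kernel walks it back by (z - z / DEPTH_MULTIPLIER) * src_step_z. E.g. with DEPTH_MULTIPLIER = 2 and z = 5 the pointer ends up at input channel 5 - (5 - 2) = 2.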
@@ -218,6 +221,22 @@
acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
})
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
+ acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
+ acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
+ acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
+ acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
+ acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
+ acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
+ })
+
#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
({ \
acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
@@ -228,6 +247,22 @@
acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
})
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
+ acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
+ acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
+ acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
+ acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
+ acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
+ acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
+ })
+
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 1
*
@@ -260,7 +295,7 @@
* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost(
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -280,20 +315,20 @@
float2 pixels3 = 0.0f;
__global uchar *weights_addr = (__global uchar *)weights.ptr;
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+ __global uchar *src_addr = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
// Load the weights
float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- // Note: Since each work-item computes 4x2 elements, we need to load 4 rows from the input tensor
+ // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row3
- float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row3
+ float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
+ float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
@@ -357,7 +392,7 @@
* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost(
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -375,7 +410,7 @@
float2 pixels1 = 0.0f;
__global uchar *weights_addr = (__global uchar *)weights.ptr;
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+ __global uchar *src_addr = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
// Load the weights
float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
@@ -414,6 +449,8 @@
vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}
+#endif // defined(DEPTH_MULTIPLIER)
+
#if defined(SRC_WIDTH) && defined(DATA_TYPE)
/** This kernel reshapes each of the tensor's low three dimensions to single rows.
*
@@ -463,17 +500,17 @@
#if defined(HAS_BIAS)
if(get_global_id(1) == 0)
{
- *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
+ *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
}
#endif // defined(HAS_BIAS)
}
#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
+ * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -502,7 +539,7 @@
const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
- const int src_z = get_global_id(2);
+ const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
@@ -526,7 +563,7 @@
#endif // defined(HAS_BIAS)
}
-#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE)
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
@@ -565,7 +602,7 @@
#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
@@ -684,6 +721,8 @@
return pixels;
}
+#if defined(DEPTH_MULTIPLIER)
+
/** This OpenCL kernel computes the depthwise convolution 3x3
*
* @param[in] src_ptr Pointer to the source image. Supported data types: F16
@@ -694,7 +733,7 @@
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
@@ -702,7 +741,7 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -732,6 +771,8 @@
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)
+ src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
@@ -746,5 +787,196 @@
vstore4(pixels, 0, (__global half *)dst.ptr);
}
+#endif // defined(DEPTH_MULTIPLIER)
#endif // defined(CONV_STRIDE_X)
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
+ * when both stride_x and stride_y are equal to 1
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+#endif /* defined(HAS_BIAS) */
+
+ half4 pixels0 = 0.0f;
+ half4 pixels1 = 0.0f;
+ half4 pixels2 = 0.0f;
+ half4 pixels3 = 0.0f;
+
+ __global uchar *weights_addr = (__global uchar *)weights.ptr;
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
+ // Load the weights
+ half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+ half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+ half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+ // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
+ half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+ half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+ half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+ half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+ half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+ half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
+
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
+
+#ifdef HAS_BIAS
+ pixels0 += (half4)bias;
+ pixels1 += (half4)bias;
+ pixels2 += (half4)bias;
+ pixels3 += (half4)bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
+ vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
+}
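
The src_addr rewind in this kernel, base + z * src_step_z - (z - z / DEPTH_MULTIPLIER) * src_step_z, is a branch-free way of landing on input plane z / DEPTH_MULTIPLIER after the tensor helper has already advanced by z planes. A small C check of that identity, sketch only, with an arbitrary step value:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    const long step = 128; /* stands in for src_step_z (bytes); arbitrary */
    for(int dm = 1; dm <= 4; ++dm)
    {
        for(int z = 0; z < 64; ++z)
        {
            const long advanced = (long)z * step; /* offset applied by CONVERT_TENSOR3D_TO_IMAGE_STRUCT */
            const long rewound  = advanced - (long)(z - z / dm) * step;
            assert(rewound == (long)(z / dm) * step);
        }
    }
    printf("rewind identity holds\n");
    return 0;
}
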
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
+ * when both stride_x and stride_y are equal to 2
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+#endif /* defined(HAS_BIAS) */
+
+ half4 pixels0 = 0.0f;
+ half4 pixels1 = 0.0f;
+
+ __global uchar *weights_addr = (__global uchar *)weights.ptr;
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
+ // Load the weights
+ half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+ half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+ half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+ // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
+ half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+ half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+ half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+ half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+ half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+ half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+ half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+ half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+ half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+ half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
+
+#ifdef HAS_BIAS
+ pixels0 += (half4)bias;
+ pixels1 += (half4)bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
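
On the vload8 + vload2 pattern in the stride-2 kernel above: with 4 output columns per row, a 3-tap filter and stride 2, input columns 0..8 are required, and the kernel fetches 10 values because the tail is read as an aligned half2 pair (the tenth element goes unused). A C sketch of the extent calculation; input_elems_needed is our name, not the library's:

#include <stdio.h>

/* Columns read by out_cols outputs of a k-tap filter with stride s:
 * 0 .. (out_cols - 1) * s + k - 1. */
static int input_elems_needed(int out_cols, int kernel_w, int stride)
{
    return (out_cols - 1) * stride + kernel_w;
}

int main(void)
{
    /* 4 outputs, 3x3 filter, stride 2 -> 9 elements (indices 0..8);
     * the kernel loads 8 + 2 = 10 for vector alignment. */
    printf("elements per row: %d\n", input_elems_needed(4, 3, 2));
    return 0;
}
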
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 40538a1..ccb3a1f 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -24,7 +24,20 @@
#include "helpers_asymm.h"
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+
+#if defined(FUSED_ACTIVATION)
+#define DATA_TYPE uchar
+#ifndef VEC_SIZE
+#define VEC_SIZE 8
+#endif /* VEC_SIZE */
+#include "activation_layer_qa8.cl"
+#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
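
The FUSED_ACTIVATION block above selects at compile time between a quantized activation and a no-op. The same pattern in self-contained C; relu_qa8 and its zero point 128 are stand-ins of ours so the sketch builds on its own, while the real dispatch goes through PERFORM_ACTIVATION_QA8 from activation_layer_qa8.cl:

#include <stdio.h>

/* Stand-in quantized ReLU: clamp below the zero point. */
unsigned char relu_qa8(unsigned char x, unsigned char zero_point)
{
    return x < zero_point ? zero_point : x;
}

#if defined(FUSED_ACTIVATION)
#define ACTIVATION_FUNC(x) relu_qa8((x), 128)
#else /* defined(FUSED_ACTIVATION) */
#define ACTIVATION_FUNC(x) (x)
#endif /* defined(FUSED_ACTIVATION) */

int main(void)
{
    /* Build with -DFUSED_ACTIVATION to fuse the activation; without it
     * the macro is the identity, exactly as in the kernel. */
    printf("%u\n", (unsigned)ACTIVATION_FUNC((unsigned char)100));
    return 0;
}
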
+
+#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X)
#if CONV_STRIDE_X > 3
#error "Stride X not supported"
@@ -62,7 +75,7 @@
})
#endif /* CONV_STRIDE_X */
-/** This function computes the horizontal integral of the image and adds offsets.
+/** This function computes the quantized depthwise convolution.
*
* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
@@ -94,7 +107,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized(
+__kernel void depthwise_convolution_3x3_quantized_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -113,6 +126,8 @@
int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
#endif //defined(HAS_BIAS)
+ src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
@@ -222,7 +237,7 @@
res0 = max(res0, (uchar8)0);
res0 = min(res0, (uchar8)255);
- vstore8(res0, 0, dst.ptr);
+ vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
#if CONV_STRIDE_Y == 1
values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
@@ -231,8 +246,481 @@
res1 = max(res1, (uchar8)0);
res1 = min(res1, (uchar8)255);
- vstore8(res1, 0, dst.ptr + dst_stride_y);
+ vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
#endif /* CONV_STRIDE_Y == 1 */
}
-#endif /* defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
+#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) */
+
+#if defined(VEC_SIZE) && defined(SRC_DEPTH) && defined(CONV_PAD_TOP) && defined(ROWS_READ)
+
+#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+
+#define BIFROST_MAD_4(acc, x, y) \
+ ({ \
+ acc.s0 += (ushort)x.s0 * (ushort)y.s0; \
+ acc.s1 += (ushort)x.s1 * (ushort)y.s1; \
+ acc.s2 += (ushort)x.s2 * (ushort)y.s2; \
+ acc.s3 += (ushort)x.s3 * (ushort)y.s3; \
+ })
+
+#if WEIGHTS_OFFSET != 0
+#define BIFROST_MAD_ACC_4(acc, sum, x, y) \
+ ({ \
+ sum += CONVERT(x, VEC_INT); \
+ BIFROST_MAD_4(acc, x, y); \
+ })
+#else /* WEIGHTS_OFFSET != 0 */
+#define BIFROST_MAD_ACC_4(acc, sum, x, y) BIFROST_MAD_4(acc, x, y)
+#endif /* WEIGHTS_OFFSET != 0 */
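
The sum/sum_we bookkeeping set up by these macros relies on the usual asymmetric-quantization expansion: sum_i (x_i + INPUT_OFFSET) * (w_i + WEIGHTS_OFFSET) = sum_i x_i * w_i + WEIGHTS_OFFSET * sum_i x_i + INPUT_OFFSET * sum_i w_i + N * INPUT_OFFSET * WEIGHTS_OFFSET, where the last term (filter size times both offsets) corresponds to the kernel's compile-time K_OFFSET. A C check of the identity, sketch only, with arbitrary values:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    const int xo = -128, wo = 5; /* stand-ins for INPUT_OFFSET, WEIGHTS_OFFSET */
    const int x[9] = { 3, 0, 255, 17, 90, 4, 200, 33, 1 };
    const int w[9] = { 7, 250, 1, 0, 128, 64, 9, 2, 11 };
    const int n = 9;

    int direct = 0, acc = 0, sum_x = 0, sum_w = 0;
    for(int i = 0; i < n; ++i)
    {
        direct += (x[i] + xo) * (w[i] + wo);
        acc    += x[i] * w[i]; /* what BIFROST_MAD_4 accumulates */
        sum_x  += x[i];        /* the "sum" accumulators */
        sum_w  += w[i];        /* sum_we */
    }
    const int k_offset = n * xo * wo; /* the compile-time K_OFFSET */
    assert(direct == acc + wo * sum_x + xo * sum_w + k_offset);
    printf("offset-folding identity holds\n");
    return 0;
}
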
+
+/** This function computes the quantized depthwise convolution.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+
+__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif /* defined(HAS_BIAS) */
+)
+{
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(HAS_BIAS)
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+#endif /* defined(HAS_BIAS) */
+
+ __global uchar *first_elem = src_ptr + src_offset_first_element_in_bytes;
+
+ const int z = get_global_id(2);
+ const int pad_offs = -ROWS_READ * src_stride_y;
+ const int src_offs0 = get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + z * src_step_z - CONV_PAD_TOP * src_stride_z;
+ const int src_offs1 = src_offs0 + src_stride_z;
+ const int src_offs2 = src_offs1 + src_stride_z;
+
+ const int cond_top = z - CONV_PAD_TOP < 0;
+ const int cond_bottom = z * (src_step_z / src_stride_z) + 2 > SRC_DEPTH;
+
+ __global uchar *src_addr0 = first_elem + select(src_offs0, pad_offs, cond_top);
+ __global uchar *src_addr1 = first_elem + src_offs1;
+ __global uchar *src_addr2 = first_elem + select(src_offs2, pad_offs, cond_bottom);
+
+ VEC_INT sum_we = 0;
+ VEC_INT acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+ VEC_INT sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+
+ // z == 0
+ VEC_UCHAR w0, w1, w2;
+ w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+ w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+ w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+ sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+ VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w0);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w1);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w0);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w1);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w2);
+
+ weights.ptr += weights_stride_z;
+
+ // z == 1
+ w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+ w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+ w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+ sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w0);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w1);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w0);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w1);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w2);
+
+ weights.ptr += weights_stride_z;
+
+ // z == 2
+ w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+ w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+ w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+ sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w0);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w1);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc1, sum1, values, w2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w0);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w1);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc3, sum3, values, w2);
+
+#if defined(HAS_BIAS)
+ acc0 += bias_values;
+ acc1 += bias_values;
+ acc2 += bias_values;
+ acc3 += bias_values;
+#endif /* defined(HAS_BIAS) */
+
+#if WEIGHTS_OFFSET != 0
+ acc0 += WEIGHTS_OFFSET * sum0;
+ acc1 += WEIGHTS_OFFSET * sum1;
+ acc2 += WEIGHTS_OFFSET * sum2;
+ acc3 += WEIGHTS_OFFSET * sum3;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+ VEC_INT offs = INPUT_OFFSET * sum_we;
+
+ acc0 += offs;
+ acc1 += offs;
+ acc2 += offs;
+ acc3 += offs;
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+ acc0 += (VEC_INT)K_OFFSET;
+ acc1 += (VEC_INT)K_OFFSET;
+ acc2 += (VEC_INT)K_OFFSET;
+ acc3 += (VEC_INT)K_OFFSET;
+#endif /* K_OFFSET != 0 */
+
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+
+ acc0 += (VEC_INT)OUTPUT_OFFSET;
+ acc1 += (VEC_INT)OUTPUT_OFFSET;
+ acc2 += (VEC_INT)OUTPUT_OFFSET;
+ acc3 += (VEC_INT)OUTPUT_OFFSET;
+
+ VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
+ VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
+ VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
+ VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
+
+ res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
+ res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
+ res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
+ res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
+
+ VSTORE(VEC_SIZE)
+ (res0, 0, dst.ptr + 0 * dst_stride_y);
+ VSTORE(VEC_SIZE)
+ (res1, 0, dst.ptr + 1 * dst_stride_y);
+ VSTORE(VEC_SIZE)
+ (res2, 0, dst.ptr + 2 * dst_stride_y);
+ VSTORE(VEC_SIZE)
+ (res3, 0, dst.ptr + 3 * dst_stride_y);
+}
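
After the offset corrections, each accumulator goes through the fixed-point requantization (ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE), the OUTPUT_OFFSET addition and the clamp to 0..255. A scalar C model of that pipeline, sketch only, in the gemmlowp style; the real vector macros live in helpers_asymm.h and the parameter values below are made up:

#include <stdint.h>
#include <stdio.h>

/* Rounding doubling high multiply; the a == b == INT32_MIN corner
 * case is ignored in this sketch. */
static int32_t srdhm(int32_t a, int32_t b)
{
    const int64_t ab    = (int64_t)a * (int64_t)b;
    const int64_t nudge = ab >= 0 ? (1LL << 30) : (1 - (1LL << 30));
    return (int32_t)((ab + nudge) >> 31);
}

/* Rounding arithmetic shift right; assumes 0 <= exponent < 31 and an
 * arithmetic right shift for negative values. */
static int32_t rounding_shift_right(int32_t x, int exponent)
{
    const int32_t mask      = ((int32_t)1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

int main(void)
{
    const int32_t output_multiplier = 1340958700; /* made-up Q0.31 multiplier */
    const int     output_shift      = 5;          /* made-up */
    const int32_t output_offset     = 128;        /* made-up */

    int32_t acc = 123456; /* int32 accumulator after all offset corrections */
    int32_t q   = rounding_shift_right(srdhm(acc, output_multiplier), output_shift) + output_offset;
    if(q < 0)   q = 0;    /* mirrors CONVERT_SAT plus CLAMP(res, 0, 255) */
    if(q > 255) q = 255;
    printf("requantized value: %d\n", (int)q);
    return 0;
}
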
+
+/** This function computes the quantized depthwise convolution.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+
+__kernel void depthwise_convolution_3x3_quantized_nhwc_stride2(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif /* defined(HAS_BIAS) */
+)
+{
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(HAS_BIAS)
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+#endif /* defined(HAS_BIAS) */
+
+ __global uchar *first_elem = src_ptr + src_offset_first_element_in_bytes;
+
+ const int z = get_global_id(2);
+ const int pad_offs = -ROWS_READ * src_stride_y;
+ const int src_offs0 = get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + z * src_step_z - CONV_PAD_TOP * src_stride_z;
+ const int src_offs1 = src_offs0 + src_stride_z;
+ const int src_offs2 = src_offs1 + src_stride_z;
+
+ const int cond_top = z - CONV_PAD_TOP < 0;
+ const int cond_bottom = z * (src_step_z / src_stride_z) + 2 > SRC_DEPTH;
+
+ __global uchar *src_addr0 = first_elem + select(src_offs0, pad_offs, cond_top);
+ __global uchar *src_addr1 = first_elem + src_offs1;
+ __global uchar *src_addr2 = first_elem + select(src_offs2, pad_offs, cond_bottom);
+
+ VEC_INT sum_we = 0;
+ VEC_INT acc0 = 0, acc2 = 0;
+ VEC_INT sum0 = 0, sum2 = 0;
+
+ // z == 0
+ VEC_UCHAR w0, w1, w2;
+ w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+ w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+ w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+ sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+ VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+
+ src_addr0 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr0);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+
+ weights.ptr += weights_stride_z;
+
+ // z == 1
+ w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+ w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+ w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+ sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+
+ src_addr1 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr1);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+
+ weights.ptr += weights_stride_z;
+
+ // z == 2
+ w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+ w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+ w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+ sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+
+ src_addr2 += src_stride_y;
+ values = VLOAD(VEC_SIZE)(0, src_addr2);
+ BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+
+#if defined(HAS_BIAS)
+ acc0 += bias_values;
+ acc2 += bias_values;
+#endif /* defined(HAS_BIAS) */
+
+#if WEIGHTS_OFFSET != 0
+ acc0 += WEIGHTS_OFFSET * sum0;
+ acc2 += WEIGHTS_OFFSET * sum2;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+ VEC_INT offs = INPUT_OFFSET * sum_we;
+
+ acc0 += offs;
+ acc2 += offs;
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+ acc0 += (VEC_INT)K_OFFSET;
+ acc2 += (VEC_INT)K_OFFSET;
+#endif /* K_OFFSET != 0 */
+
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 += (VEC_INT)OUTPUT_OFFSET;
+ acc2 += (VEC_INT)OUTPUT_OFFSET;
+ VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
+ VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
+ res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
+ res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
+
+ VSTORE(VEC_SIZE)
+ (res0, 0, dst.ptr + 0 * dst_stride_y);
+ VSTORE(VEC_SIZE)
+ (res2, 0, dst.ptr + 1 * dst_stride_y);
+}
+
+#endif /* defined(VEC_SIZE) && defined(SRC_DEPTH) && defined(CONV_PAD_TOP) && defined(ROWS_READ) */
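
A note on the cond_top/cond_bottom logic in both NHWC kernels above: out-of-range rows are not branched around; their offsets are swapped via select() for pad_offs, which points at a border region holding the pad value. The idea in plain C, sketch only; the 5-row buffer with a dedicated border row is our illustration, not the library's actual layout:

#include <stdio.h>

int main(void)
{
    /* Data rows 0..3 plus one border row at index 4, kept at the pad value. */
    int rows[5][4] = {
        { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }, { 13, 14, 15, 16 },
        { 0, 0, 0, 0 } /* border row */
    };
    const int pad_row = 4, num_rows = 4;

    for(int r = -1; r <= num_rows; ++r)
    {
        const int out_of_range = (r < 0) | (r >= num_rows);
        const int use = out_of_range ? pad_row : r; /* select(r, pad_row, out_of_range) */
        printf("logical row %2d reads physical row %d (first value %d)\n", r, use, rows[use][0]);
    }
    return 0;
}
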
+
+#endif /* defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index fbd4f6a..33a9495 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
* @attention The border size for top, bottom, left, right needs to be passed at the compile time.
* e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
*
- * @param[in,out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
* @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
@@ -110,7 +110,7 @@
* @attention The border size for top, bottom, left, right needs to be passed at the compile time.
* e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
*
- * @param[out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
* @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 58a550f..9ed3af8 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -49,27 +49,35 @@
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
+ uint z = get_global_id(2);
// Compute address for Matrix B - source
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
(x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
+ // Add offset for batched GEMM
+ dst_addr_in_bytes += z * dst_stride_z;
+
VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
@@ -90,37 +98,47 @@
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
{
// Compute source and destination addresses
uint x = get_global_id(0);
uint y = get_global_id(1);
+ uint z = get_global_id(2);
- // Compute address for Matrix B - source
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ // Compute address for source tensor
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
(y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
+ // Add offset for batched GEMM
+ dst_addr_in_bytes += z * dst_stride_z;
+
+ __global uchar *input_ptr = src.ptr;
+
// Load values from Matrix A
VEC_DATA_TYPE(DATA_TYPE, 4)
- a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
VEC_DATA_TYPE(DATA_TYPE, 4)
- a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
VEC_DATA_TYPE(DATA_TYPE, 4)
- a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
VEC_DATA_TYPE(DATA_TYPE, 4)
- a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+ a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
VEC_DATA_TYPE(DATA_TYPE, 4)
val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
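
For reference, gemm_interleave4x4 emits the 4x4 tile column by column (val0 = a0.s0, a1.s0, a2.s0, a3.s0, and so on), so the matrix-multiply kernel can fetch one column of A per vload4. A C sketch of the layout, assuming MULT_INTERLEAVE4X4_HEIGHT is 1:

#include <stdio.h>

int main(void)
{
    float in[4][4], out[16];
    for(int r = 0; r < 4; ++r)
        for(int c = 0; c < 4; ++c)
            in[r][c] = (float)(r * 4 + c);

    int k = 0;
    for(int c = 0; c < 4; ++c)     /* walk columns... */
        for(int r = 0; r < 4; ++r) /* ...emitting the four row values of each */
            out[k++] = in[r][c];

    for(int i = 0; i < 16; ++i)
        printf("%g%c", out[i], i == 15 ? '\n' : ' ');
    return 0;
}
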
@@ -144,6 +162,8 @@
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -164,12 +184,16 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -177,8 +201,18 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global float *src_end_addr_b = src_addr_b + COLS_B;
@@ -236,11 +270,17 @@
c30 = c30 * (float4)ALPHA;
#endif // defined(ALPHA)
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
// Store 4x4 block
- vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
- vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
- vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
- vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
+ vstore4(c00, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore4(c10, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+ vstore4(c20, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+ vstore4(c30, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
}
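
The MATRIX_B_DEPTH branch above implements a modulo broadcast: when matrix A is 4-dimensional and matrix B 3-dimensional (the 2D Winograd case), A's flattened z index runs over MATRIX_B_DEPTH * batches planes while B only has MATRIX_B_DEPTH, so B's plane is picked as z % MATRIX_B_DEPTH. A small C sketch with illustrative sizes:

#include <stdio.h>

int main(void)
{
    const int matrix_b_depth = 16; /* illustrative */
    const int batches        = 3;  /* illustrative */

    for(int z = 0; z < matrix_b_depth * batches; ++z)
    {
        /* src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z */
        const int b_plane = z % matrix_b_depth;
        if(z % matrix_b_depth == 0 || z == matrix_b_depth * batches - 1)
            printf("A plane %2d -> B plane %2d\n", z, b_plane);
    }
    return 0;
}
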
/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
@@ -249,6 +289,9 @@
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -271,10 +314,14 @@
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -282,11 +329,18 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
- // Compute end row address for matrix B
- __global float *src_end_addr_b = src_addr_b + COLS_B;
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
src_addr_a += offset_row_a;
src_addr_b += offset_row_b;
@@ -309,35 +363,17 @@
float c32 = 0.0f;
float c33 = 0.0f;
- for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
+#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
+
+ int i = 0;
+ for(; i <= (int)(COLS_MTX_B - 4); i += 4)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
float4 b0 = vload4(0, src_addr_b);
- c00 = fma(a0.s0, b0.s0, c00);
- c01 = fma(a0.s0, b0.s1, c01);
- c02 = fma(a0.s0, b0.s2, c02);
- c03 = fma(a0.s0, b0.s3, c03);
-
- c10 = fma(a0.s1, b0.s0, c10);
- c11 = fma(a0.s1, b0.s1, c11);
- c12 = fma(a0.s1, b0.s2, c12);
- c13 = fma(a0.s1, b0.s3, c13);
-
- c20 = fma(a0.s2, b0.s0, c20);
- c21 = fma(a0.s2, b0.s1, c21);
- c22 = fma(a0.s2, b0.s2, c22);
- c23 = fma(a0.s2, b0.s3, c23);
-
- c30 = fma(a0.s3, b0.s0, c30);
- c31 = fma(a0.s3, b0.s1, c31);
- c32 = fma(a0.s3, b0.s2, c32);
- c33 = fma(a0.s3, b0.s3, c33);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
- b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -360,8 +396,11 @@
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
- b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -384,8 +423,38 @@
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
- b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -408,12 +477,15 @@
c33 = fma(a0.s3, b0.s3, c33);
}
- for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
+ for(; i < (int)(COLS_MTX_B); ++i)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
float4 b0 = vload4(0, src_addr_b);
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
c02 = fma(a0.s0, b0.s2, c02);
@@ -458,13 +530,22 @@
c33 = c33 * ALPHA;
#endif // defined(ALPHA)
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
// Store 4x4 block
- vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
- vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
- vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
- vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+ vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+ vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+ vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
}
+// Undefine local defines
+#undef COLS_MTX_B
+
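
The restructured Bifrost loop above swaps the pointer-bound loop for a counted loop over COLS_MTX_B, unrolled four times with a scalar tail. The same shape on a plain dot product, sketch only, with made-up arrays:

#include <stdio.h>

static float dot(const float *a, const float *b, int n)
{
    float acc = 0.0f;
    int   i   = 0;
    for(; i <= n - 4; i += 4) /* mirrors for(; i <= (int)(COLS_MTX_B - 4); i += 4) */
    {
        acc += a[i + 0] * b[i + 0];
        acc += a[i + 1] * b[i + 1];
        acc += a[i + 2] * b[i + 2];
        acc += a[i + 3] * b[i + 3];
    }
    for(; i < n; ++i) /* mirrors the for(; i < (int)(COLS_MTX_B); ++i) tail */
        acc += a[i] * b[i];
    return acc;
}

int main(void)
{
    const float a[7] = { 1, 2, 3, 4, 5, 6, 7 };
    const float b[7] = { 7, 6, 5, 4, 3, 2, 1 };
    printf("%g\n", dot(a, b, 7)); /* prints 84 */
    return 0;
}
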
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
@@ -472,6 +553,8 @@
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -494,10 +577,14 @@
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -505,8 +592,18 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global half *src_end_addr_b = src_addr_b + COLS_B;
@@ -564,12 +661,231 @@
c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
// Store 4x8 block
- vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
- vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
- vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
- vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
+ vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+ vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+ vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+ vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
}
+
+/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
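+ * @note Example (hypothetical value): with -DMULT_TRANSPOSE1XW_WIDTH=2, two consecutive work-items along x share the same row of the reshaped matrix B and each selects its own 8-element block through offset_row_b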
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c00 = 0.0f;
+ half8 c10 = 0.0f;
+ half8 c20 = 0.0f;
+ half8 c30 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
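+ // Example (hypothetical size): -DCOLS_B=128 with -DMULT_TRANSPOSE1XW_WIDTH=2 gives COLS_MTX_B = 128 / 16 = 8 blocks of 8 columns to accumulate per output tile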
+
+ int i = 0;
+ for(; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half8 a0 = vload8(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s4, b0, c00);
+ c10 = fma((half8)a0.s5, b0, c10);
+ c20 = fma((half8)a0.s6, b0, c20);
+ c30 = fma((half8)a0.s7, b0, c30);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload8(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s4, b0, c00);
+ c10 = fma((half8)a0.s5, b0, c10);
+ c20 = fma((half8)a0.s6, b0, c20);
+ c30 = fma((half8)a0.s7, b0, c30);
+#else // MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+ }
+
+ for(; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c00 = fma((half8)a0.s0, b0, c00);
+ c10 = fma((half8)a0.s1, b0, c10);
+ c20 = fma((half8)a0.s2, b0, c20);
+ c30 = fma((half8)a0.s3, b0, c30);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(ALPHA)
+ // Multiply by the weight of matrix product
+ c00 = c00 * (half8)ALPHA;
+ c10 = c10 * (half8)ALPHA;
+ c20 = c20 * (half8)ALPHA;
+ c30 = c30 * (half8)ALPHA;
+#endif // defined(ALPHA)
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+ // Store 4x8 block
+ vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+ vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+ vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+ vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#if defined(FIXED_POINT_POSITION)
@@ -579,8 +895,9 @@
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 8 bit fixed point format
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note ALPHA must be passed in 8 bit fixed point format
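+ * For example (hypothetical values), with -DFIXED_POINT_POSITION=5 an alpha of 0.5f would be passed as -DALPHA=16 (0.5 * 2^5)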
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -603,10 +920,14 @@
*/
__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -614,8 +935,18 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes;
- __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes;
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);
+ __global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global char *src_end_addr_b = src_addr_b + COLS_B;
@@ -667,11 +998,17 @@
c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
// Store 16x4 block
- vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
- vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
- vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
- vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
+ vstore16(c00_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
+ vstore16(c10_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
+ vstore16(c20_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
+ vstore16(c30_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
}
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
@@ -680,8 +1017,9 @@
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 16 bit fixed point format
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note ALPHA must be passed in 16 bit fixed point format
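+ * For example (hypothetical values), with -DFIXED_POINT_POSITION=10 an alpha of 0.25f would be passed as -DALPHA=256 (0.25 * 2^10)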
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -704,10 +1042,14 @@
*/
__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -715,8 +1057,18 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);
+ __global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global short *src_end_addr_b = src_addr_b + COLS_B;
@@ -759,11 +1111,17 @@
c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
// Store 8x4 block
- vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
- vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
- vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
- vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
+ vstore8(c00_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));
+ vstore8(c10_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));
+ vstore8(c20_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));
+ vstore8(c30_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));
}
#endif // defined(FIXED_POINT_POSITION)
#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
@@ -771,12 +1129,14 @@
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @note This OpenCL kernel works with floating point data types (F16/F32)
* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
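+ * @note Example (hypothetical values): "-DDATA_TYPE=float -DCOLS_A=256 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4" could build this kernel for a 256-column matrix A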
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -799,7 +1159,10 @@
*/
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -812,6 +1175,16 @@
// Update address for the matrix B
src_addr.s1 += idx * sizeof(DATA_TYPE);
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
VECTOR_TYPE acc0 = 0.0f;
@@ -895,43 +1268,51 @@
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
// Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
acc0 = acc0 * (VECTOR_TYPE)ALPHA;
#endif // defined(ALPHA)
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
+ (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if defined(ALPHA)
acc1 = acc1 * (VECTOR_TYPE)ALPHA;
#endif // defined(ALPHA)
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
+ (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if defined(ALPHA)
acc2 = acc2 * (VECTOR_TYPE)ALPHA;
#endif // defined(ALPHA)
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
+ (acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if defined(ALPHA)
acc3 = acc3 * (VECTOR_TYPE)ALPHA;
#endif // defined(ALPHA)
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
+ (acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#endif // defined(DATA_TYPE)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -954,7 +1335,10 @@
*/
__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -967,8 +1351,15 @@
// Update address for matrix B
src_addr.s1 += idx * sizeof(float);
- // Address boundary for matrix A
- int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
// Initialize accumulators
float acc00 = 0.0f;
@@ -998,72 +1389,162 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
// A and B src indices get incremented at the same time.
- for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ int i = 0;
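+ // The following loop is unrolled 4x along the columns of matrix A: each iteration consumes a0.s0..a0.s3 against four consecutive rows of matrix B; the left-over loop below handles any remaining columns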
+ for(; i <= ((int)COLS_A - 4); i += 4)
{
- // Load values from matrix A
- float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+ // Load values from matrix A and matrix B
+ float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- float2 a1 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+ float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- float2 a2 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+ float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- float2 a3 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+ float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- // Load values from matrix B
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
- float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
// Multiply and accumulate
acc00 = fma(a0.s0, b0.s0, acc00);
- acc00 = fma(a0.s1, b1.s0, acc00);
acc01 = fma(a0.s0, b0.s1, acc01);
- acc01 = fma(a0.s1, b1.s1, acc01);
acc02 = fma(a0.s0, b0.s2, acc02);
- acc02 = fma(a0.s1, b1.s2, acc02);
- acc03 = fma(a0.s1, b1.s3, acc03);
acc03 = fma(a0.s0, b0.s3, acc03);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
acc10 = fma(a1.s0, b0.s0, acc10);
acc11 = fma(a1.s0, b0.s1, acc11);
acc12 = fma(a1.s0, b0.s2, acc12);
acc13 = fma(a1.s0, b0.s3, acc13);
- acc10 = fma(a1.s1, b1.s0, acc10);
- acc11 = fma(a1.s1, b1.s1, acc11);
- acc12 = fma(a1.s1, b1.s2, acc12);
- acc13 = fma(a1.s1, b1.s3, acc13);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
acc20 = fma(a2.s0, b0.s0, acc20);
acc21 = fma(a2.s0, b0.s1, acc21);
acc22 = fma(a2.s0, b0.s2, acc22);
acc23 = fma(a2.s0, b0.s3, acc23);
- acc20 = fma(a2.s1, b1.s0, acc20);
- acc21 = fma(a2.s1, b1.s1, acc21);
- acc22 = fma(a2.s1, b1.s2, acc22);
- acc23 = fma(a2.s1, b1.s3, acc23);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
acc30 = fma(a3.s0, b0.s0, acc30);
acc31 = fma(a3.s0, b0.s1, acc31);
acc32 = fma(a3.s0, b0.s2, acc32);
acc33 = fma(a3.s0, b0.s3, acc33);
-
- acc30 = fma(a3.s1, b1.s0, acc30);
- acc31 = fma(a3.s1, b1.s1, acc31);
- acc32 = fma(a3.s1, b1.s2, acc32);
- acc33 = fma(a3.s1, b1.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc00 = fma(a0.s1, b0.s0, acc00);
+ acc01 = fma(a0.s1, b0.s1, acc01);
+ acc02 = fma(a0.s1, b0.s2, acc02);
+ acc03 = fma(a0.s1, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc10 = fma(a1.s1, b0.s0, acc10);
+ acc11 = fma(a1.s1, b0.s1, acc11);
+ acc12 = fma(a1.s1, b0.s2, acc12);
+ acc13 = fma(a1.s1, b0.s3, acc13);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc20 = fma(a2.s1, b0.s0, acc20);
+ acc21 = fma(a2.s1, b0.s1, acc21);
+ acc22 = fma(a2.s1, b0.s2, acc22);
+ acc23 = fma(a2.s1, b0.s3, acc23);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc30 = fma(a3.s1, b0.s0, acc30);
+ acc31 = fma(a3.s1, b0.s1, acc31);
+ acc32 = fma(a3.s1, b0.s2, acc32);
+ acc33 = fma(a3.s1, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc00 = fma(a0.s2, b0.s0, acc00);
+ acc01 = fma(a0.s2, b0.s1, acc01);
+ acc02 = fma(a0.s2, b0.s2, acc02);
+ acc03 = fma(a0.s2, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc10 = fma(a1.s2, b0.s0, acc10);
+ acc11 = fma(a1.s2, b0.s1, acc11);
+ acc12 = fma(a1.s2, b0.s2, acc12);
+ acc13 = fma(a1.s2, b0.s3, acc13);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc20 = fma(a2.s2, b0.s0, acc20);
+ acc21 = fma(a2.s2, b0.s1, acc21);
+ acc22 = fma(a2.s2, b0.s2, acc22);
+ acc23 = fma(a2.s2, b0.s3, acc23);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc30 = fma(a3.s2, b0.s0, acc30);
+ acc31 = fma(a3.s2, b0.s1, acc31);
+ acc32 = fma(a3.s2, b0.s2, acc32);
+ acc33 = fma(a3.s2, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc00 = fma(a0.s3, b0.s0, acc00);
+ acc01 = fma(a0.s3, b0.s1, acc01);
+ acc02 = fma(a0.s3, b0.s2, acc02);
+ acc03 = fma(a0.s3, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc10 = fma(a1.s3, b0.s0, acc10);
+ acc11 = fma(a1.s3, b0.s1, acc11);
+ acc12 = fma(a1.s3, b0.s2, acc12);
+ acc13 = fma(a1.s3, b0.s3, acc13);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc20 = fma(a2.s3, b0.s0, acc20);
+ acc21 = fma(a2.s3, b0.s1, acc21);
+ acc22 = fma(a2.s3, b0.s2, acc22);
+ acc23 = fma(a2.s3, b0.s3, acc23);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc30 = fma(a3.s3, b0.s0, acc30);
+ acc31 = fma(a3.s3, b0.s1, acc31);
+ acc32 = fma(a3.s3, b0.s2, acc32);
+ acc33 = fma(a3.s3, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(float);
}
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ for(; i < (int)COLS_A; ++i)
{
// Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -1075,6 +1556,7 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
// Load values from matrix B
float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
// Multiply and accumulate
acc00 = fma(a0, b0.s0, acc00);
@@ -1099,6 +1581,8 @@
acc32 = fma(a3, b0.s2, acc32);
acc33 = fma(a3, b0.s3, acc33);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
}
// Compute destination address
@@ -1112,8 +1596,14 @@
acc03 = acc03 * ALPHA;
#endif // defined(ALPHA)
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));
- vstore4(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if defined(ALPHA)
@@ -1123,7 +1613,7 @@
acc13 = acc13 * ALPHA;
#endif // defined(ALPHA)
float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));
- vstore4(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if defined(ALPHA)
@@ -1133,7 +1623,7 @@
acc23 = acc23 * ALPHA;
#endif // defined(ALPHA)
float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));
- vstore4(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if defined(ALPHA)
@@ -1143,7 +1633,7 @@
acc33 = acc33 * ALPHA;
#endif // defined(ALPHA)
float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));
- vstore4(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
+ vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
@@ -1155,6 +1645,8 @@
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
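+ * @note Example (hypothetical values): "-DCOLS_A=256 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=1" matches the vect2 output path of this kernel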
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1177,7 +1669,10 @@
*/
__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
// Requires NUM_ELEMS_PROCESSED_PER_THREAD_X = 2: C is a vect2, A a vect4 and B two vload2 // TODO: fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -1191,8 +1686,15 @@
// Update address for the matrix B
src_addr.s1 += idx * sizeof(float);
- // Address boundary for the matrix A
- int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
// Initialize accumulators
float acc00 = 0.0f;
@@ -1212,67 +1714,114 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
// A and B src indices get incremented at the same time.
- for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(float)); src_addr += (int2)(4 * sizeof(float), 4 * src1_stride_y))
+ int i = 0;
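+ // Unrolled 8x along the columns of matrix A: a float8 of A is multiplied against eight consecutive rows of matrix B per iteration; the left-over loop below handles any remaining columns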
+ for(; i <= ((int)COLS_A - 8); i += 8)
{
// Load values from matrix A
- float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
// Load values from matrix B
- float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
- float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
- float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
- float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
// Multiply and accumulate
acc00 = fma(a0.s0, b0.s0, acc00);
acc00 = fma(a0.s1, b1.s0, acc00);
acc00 = fma(a0.s2, b2.s0, acc00);
acc00 = fma(a0.s3, b3.s0, acc00);
+ acc00 = fma(a0.s4, b4.s0, acc00);
+ acc00 = fma(a0.s5, b5.s0, acc00);
+ acc00 = fma(a0.s6, b6.s0, acc00);
+ acc00 = fma(a0.s7, b7.s0, acc00);
acc01 = fma(a0.s0, b0.s1, acc01);
acc01 = fma(a0.s1, b1.s1, acc01);
acc01 = fma(a0.s2, b2.s1, acc01);
acc01 = fma(a0.s3, b3.s1, acc01);
+ acc01 = fma(a0.s4, b4.s1, acc01);
+ acc01 = fma(a0.s5, b5.s1, acc01);
+ acc01 = fma(a0.s6, b6.s1, acc01);
+ acc01 = fma(a0.s7, b7.s1, acc01);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
acc10 = fma(a0.s0, b0.s0, acc10);
acc10 = fma(a0.s1, b1.s0, acc10);
acc10 = fma(a0.s2, b2.s0, acc10);
acc10 = fma(a0.s3, b3.s0, acc10);
+ acc10 = fma(a0.s4, b4.s0, acc10);
+ acc10 = fma(a0.s5, b5.s0, acc10);
+ acc10 = fma(a0.s6, b6.s0, acc10);
+ acc10 = fma(a0.s7, b7.s0, acc10);
acc11 = fma(a0.s0, b0.s1, acc11);
acc11 = fma(a0.s1, b1.s1, acc11);
acc11 = fma(a0.s2, b2.s1, acc11);
acc11 = fma(a0.s3, b3.s1, acc11);
+ acc11 = fma(a0.s4, b4.s1, acc11);
+ acc11 = fma(a0.s5, b5.s1, acc11);
+ acc11 = fma(a0.s6, b6.s1, acc11);
+ acc11 = fma(a0.s7, b7.s1, acc11);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
acc20 = fma(a0.s0, b0.s0, acc20);
acc20 = fma(a0.s1, b1.s0, acc20);
acc20 = fma(a0.s2, b2.s0, acc20);
acc20 = fma(a0.s3, b3.s0, acc20);
+ acc20 = fma(a0.s4, b4.s0, acc20);
+ acc20 = fma(a0.s5, b5.s0, acc20);
+ acc20 = fma(a0.s6, b6.s0, acc20);
+ acc20 = fma(a0.s7, b7.s0, acc20);
acc21 = fma(a0.s0, b0.s1, acc21);
acc21 = fma(a0.s1, b1.s1, acc21);
acc21 = fma(a0.s2, b2.s1, acc21);
acc21 = fma(a0.s3, b3.s1, acc21);
+ acc21 = fma(a0.s4, b4.s1, acc21);
+ acc21 = fma(a0.s5, b5.s1, acc21);
+ acc21 = fma(a0.s6, b6.s1, acc21);
+ acc21 = fma(a0.s7, b7.s1, acc21);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
acc30 = fma(a0.s0, b0.s0, acc30);
acc30 = fma(a0.s1, b1.s0, acc30);
acc30 = fma(a0.s2, b2.s0, acc30);
acc30 = fma(a0.s3, b3.s0, acc30);
+ acc30 = fma(a0.s4, b4.s0, acc30);
+ acc30 = fma(a0.s5, b5.s0, acc30);
+ acc30 = fma(a0.s6, b6.s0, acc30);
+ acc30 = fma(a0.s7, b7.s0, acc30);
acc31 = fma(a0.s0, b0.s1, acc31);
acc31 = fma(a0.s1, b1.s1, acc31);
acc31 = fma(a0.s2, b2.s1, acc31);
acc31 = fma(a0.s3, b3.s1, acc31);
+ acc31 = fma(a0.s4, b4.s1, acc31);
+ acc31 = fma(a0.s5, b5.s1, acc31);
+ acc31 = fma(a0.s6, b6.s1, acc31);
+ acc31 = fma(a0.s7, b7.s1, acc31);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float) * 8;
}
// Left-over loop: process one column of matrix A per iteration
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(4, src1_stride_y))
+ for(; i < (int)COLS_A; ++i)
{
// Load values from matrix A
float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
@@ -1287,6 +1836,7 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
// Load values from matrix B
float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
// Multiply and accumulate
acc00 = fma(a0, b0.s0, acc00);
@@ -1303,25 +1853,33 @@
acc30 = fma(a3, b0.s0, acc30);
acc31 = fma(a3, b0.s1, acc31);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
}
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
// Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
acc00 = acc00 * ALPHA;
acc01 = acc01 * ALPHA;
#endif // defined(ALPHA)
float2 acc0 = ((float2)(acc00, acc01));
- vstore2(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if defined(ALPHA)
acc10 = acc10 * ALPHA;
acc11 = acc11 * ALPHA;
#endif // defined(ALPHA)
float2 acc1 = ((float2)(acc10, acc11));
- vstore2(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if defined(ALPHA)
@@ -1329,7 +1887,7 @@
acc21 = acc21 * ALPHA;
#endif // defined(ALPHA)
float2 acc2 = ((float2)(acc20, acc21));
- vstore2(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if defined(ALPHA)
@@ -1337,7 +1895,212 @@
acc31 = acc31 * ALPHA;
#endif // defined(ALPHA)
float2 acc3 = (float2)(acc30, acc31);
- vstore2(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
+ vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ half8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
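+ // Unrolled 4x along the columns of matrix A: each iteration multiplies four columns of matrix A (a0.s0..a0.s3) against four consecutive rows of matrix B; the left-over loop below handles any remaining columns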
+ for(; i <= ((int)COLS_A - 4); i += 4)
+ {
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for(; i < (int)COLS_A; ++i)
+ {
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ acc0 = acc0 * (half8)ALPHA;
+#endif // defined(ALPHA)
+ vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
+ acc1 = acc1 * (half8)ALPHA;
+#endif // defined(ALPHA)
+ vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
+ acc2 = acc2 * (half8)ALPHA;
+#endif // defined(ALPHA)
+ vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
+ acc3 = acc3 * (half8)ALPHA;
+#endif // defined(ALPHA)
+ vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
@@ -1349,6 +2112,8 @@
* @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
* @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
* @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1371,7 +2136,10 @@
*/
__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -1384,6 +2152,16 @@
// Update address for the matrix B
src_addr.s1 += idx * sizeof(char);
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
short8 acc00 = 0;
@@ -1475,33 +2253,39 @@
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
// Multiply by the weight of matrix product and store the result
char16 acc_qs8;
acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
#if defined(ALPHA)
acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
+ vstore16(acc_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
#if defined(ALPHA)
acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
+ vstore16(acc_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
#if defined(ALPHA)
acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
+ vstore16(acc_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
#if defined(ALPHA)
acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
+ vstore16(acc_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
@@ -1512,6 +2296,8 @@
* @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
* @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
* @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
+ * This case can occur when GEMM performs the element-wise multiplication of a batched matrix multiplication (2D Winograd) with multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1534,7 +2320,10 @@
*/
__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z)
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -1547,6 +2336,16 @@
// Update address for the matrix B
src_addr.s1 += idx * sizeof(short);
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
int8 acc0 = 0;
@@ -1622,33 +2421,39 @@
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
// Multiply by the weight of matrix product and store the result
short8 acc_qs16;
acc_qs16 = convert_short8_sat(acc0);
#if defined(ALPHA)
acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
+ vstore8(acc_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
acc_qs16 = convert_short8_sat(acc1);
#if defined(ALPHA)
acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
+ vstore8(acc_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
acc_qs16 = convert_short8_sat(acc2);
#if defined(ALPHA)
acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
+ vstore8(acc_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
acc_qs16 = convert_short8_sat(acc3);
#if defined(ALPHA)
acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
+ vstore8(acc_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#endif // defined(FIXED_POINT_POSITION)
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 02c6c4c..615c518 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -29,7 +29,9 @@
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#if defined(ARM_COMPUTE_DEBUG_ENABLED)
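+// The OpenCL compiler predefines a macro named after each extension supported by the device,
+// so the pragma below is only enabled where cl_arm_printf is actually available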
+#if defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf)
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
#define EXPAND(x) x
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index a69bcc1..c314d17 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -62,6 +62,7 @@
b_64 = convert_long##size(b); \
VEC_DATA_TYPE(long, size) \
ab_64 = a_64 * b_64; \
+ /* COMPMID-907 */ \
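+    /* Round to nearest, ties upwards: adding 2^30 before the arithmetic shift  \
+       right by 31 takes the high 32 bits of the doubled product, rounded */    \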
VEC_DATA_TYPE(int, size) \
ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
return select(ab_x2_high32, INT_MAX, overflow); \
@@ -366,4 +367,4 @@
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
index 3d37fbc..407ee2f 100644
--- a/src/core/CL/cl_kernels/hog.cl
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -351,7 +351,7 @@
}
#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
-#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(BLOCK_STRIDE_WIDTH) && defined(BLOCK_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
+#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
/** This OpenCL kernel computes the HOG detector using linear SVM
*
@@ -362,8 +362,8 @@
* -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
 * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
* -# -DIDX_CLASS = Index of the class to detect
- * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
- * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
+ * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
* -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
* -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
*
@@ -443,8 +443,8 @@
int id = atomic_inc(num_detection_windows);
if(id < MAX_NUM_DETECTION_WINDOWS)
{
- dst[id].x = get_global_id(0) * BLOCK_STRIDE_WIDTH;
- dst[id].y = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+ dst[id].x = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
+ dst[id].y = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
dst[id].width = DETECTION_WINDOW_WIDTH;
dst[id].height = DETECTION_WINDOW_HEIGHT;
dst[id].idx_class = IDX_CLASS;
@@ -453,4 +453,4 @@
}
}
#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
- * BLOCK_STRIDE_WIDTH && BLOCK_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
+ * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 75d99bd..1e85e1b 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -680,6 +680,7 @@
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
@@ -722,10 +723,12 @@
__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
// Linearize convolution elements
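+    // With dilation, consecutive kernel taps are DILATION_X/DILATION_Y elements apart:
+    // e.g. KERNEL_WIDTH = 3 with DILATION_X = 2 reads columns xi, xi + 2 and xi + 4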
- for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
{
- for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
+ int y = yi + yk * DILATION_Y;
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
{
+ int x = xi + xk * DILATION_X;
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
index e1131d5..8a126a0 100644
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@
float x; /**< The x coordinate. */
float y; /**< The y coordinate. */
float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
- float dummy;
+ float dummy; /**< Dummy member for alignment. */
} InternalKeypoint;
/** Threshold for the determinant. Used for lost tracking criteria */
@@ -167,7 +167,11 @@
Keypoint new_point;
new_point.x = round(new_point_internal.x);
new_point.y = round(new_point_internal.y);
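+    // Zero-initialize the Keypoint fields that the tracker does not compute, so the
+    // output array never contains uninitialized values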
+ new_point.strength = 0.f;
+ new_point.scale = 0.f;
+ new_point.orientation = 0.f;
new_point.tracking_status = new_point_internal.tracking_status;
+ new_point.error = 0.f;
// Store new point
new_points[idx] = new_point;
@@ -352,8 +356,7 @@
 * @param[in] border_limits   It stores the right and bottom border limits (width - window_dimension - 1, height - window_dimension - 1)
* @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
* @param[in] level0 It is set to 1 if level of pyramid = 0
- * @param[in] term_iteration It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
- * @param[in] term_epsilon It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
+ * @param[in] term_epsilon It is set to 1 if termination = TERM_CRITERIA_EPSILON
*/
void __kernel lktracker_stage1(
IMAGE_DECLARATION(new_image),
@@ -368,7 +371,6 @@
const float3 border_limits,
const float eig_const,
const int level0,
- const int term_iteration,
const int term_epsilon)
{
int idx = get_global_id(0);
@@ -512,10 +514,7 @@
// Update previous delta
prev_delta = delta;
- if(term_iteration == 1)
- {
- j++;
- }
+ j++;
}
new_points[idx].xy = out_new_point;
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index dae0b99..2c7ddfd 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -62,6 +62,8 @@
#endif /* FIXED_POINT_POSITION */
+#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(DATA_TYPE, 8))(1.f / y))
+
#if STRIDE_X == 1
#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
#elif STRIDE_X == 2 /* STRIDE_X == 1 */
@@ -423,7 +425,7 @@
#endif // POOL_AVG
-/** Performs a pooling function of pool size equal to N
+/** Performs a pooling function of pool size equal to N (NCHW)
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
* @note -DFP16 must be passed at compile time if half float data type is used
@@ -451,7 +453,7 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void pooling_layer_MxN(
+__kernel void pooling_layer_MxN_nchw(
TENSOR3D_DECLARATION(input),
TENSOR3D_DECLARATION(output))
{
@@ -512,3 +514,97 @@
*(__global DATA_TYPE *)output.ptr = res;
}
#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
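+/** Compute the divisor for average/L2 pooling (NHWC): the number of input elements covered
+ *  by the pooling window at the current output position. With -DEXCLUDE_PADDING the window
+ *  is clamped to the image borders, so padded elements do not contribute to the average.
+ */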
+DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(1) * stride_x - pad_x;
+ int start_y = get_global_id(2) * stride_y - pad_y;
+
+#if !defined(EXCLUDE_PADDING)
+ upper_bound_w += pad_x;
+ upper_bound_h += pad_y;
+#endif /* !defined(EXCLUDE_PADDING) */
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NHWC)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
+ * @note -DFP16 must be passed at compile time if half float data type is used
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_nhwc(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vdata = INITIAL_VALUE;
+ DATA_TYPE sdata = INITIAL_VALUE;
+
+ const int idx_width = get_global_id(1) * STRIDE_X;
+ const int idx_height = get_global_id(2) * STRIDE_Y;
+
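+    // The select() calls below remap taps that fall outside the image into the padded
+    // border of the input tensor, so the 8-wide vload never reads unrelated data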
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+ int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+ int x1 = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
+ x1 = select(x1, PAD_X - idx_width - 1, y != y1);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ vdata = POOL_OP(vdata, data0);
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average pooling
+ vdata = DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ vdata = SQRT_OP(vdata);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ vstore8(vdata, 0, (__global DATA_TYPE *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 98850c0..17d893a 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -31,6 +31,8 @@
#define DIV_OP(x, y) (x * (1.f / y))
+#define DIV_OP_NHWC(x, y) (convert_float8(x) * (float8)(1.f / y))
+
#if defined(POOL_L2)
#error "L2 pooling is not supported"
#endif /* defined(POOL_L2) */
@@ -49,7 +51,7 @@
return ((end_y - start_y) * (end_x - start_x));
}
-/** Performs a pooling function of pool size equal to N
+/** Performs a pooling function of pool size equal to N (NCHW)
*
* @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
* @note In case of average pooling the following information must be passed at compile time:
@@ -75,7 +77,7 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void pooling_layer_MxN_quantized(
+__kernel void pooling_layer_MxN_quantized_nchw(
TENSOR3D_DECLARATION(input),
TENSOR3D_DECLARATION(output))
{
@@ -119,3 +121,79 @@
// Store result
*(__global uchar *)output.ptr = convert_uchar(res);
}
+
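+/** Compute the divisor for quantized average pooling (NHWC): the pooling window is always
+ *  clamped to the image borders, so padded elements never contribute to the average.
+ */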
+int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(1) * stride_x - pad_x;
+ int start_y = get_global_id(2) * stride_y - pad_y;
+
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NHWC)
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nhwc(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int8 vdata = 0;
+
+ const int idx_width = get_global_id(1) * STRIDE_X;
+ const int idx_height = get_global_id(2) * STRIDE_Y;
+
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+ int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+ int x1 = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
+ x1 = select(x1, PAD_X - idx_width - 1, y != y1);
+ uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+ int8 data0 = convert_int8(data);
+ vdata = POOL_OP(vdata, data0);
+ }
+ }
+
+#if defined(POOL_AVG)
+ // Divide by pool region in case of average pooling
+ vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
+#endif /* defined(POOL_AVG) */
+
+ // Store result
+ vstore8(convert_uchar8(vdata), 0, (__global uchar *)output.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl
new file mode 100644
index 0000000..0458e53
--- /dev/null
+++ b/src/core/CL/cl_kernels/winograd.cl
@@ -0,0 +1,1611 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(NUM_CHANNELS)
+
+/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+ float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+    // Transform the 3x3 tile into a 4x4 tile
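+    // The transform below computes out = G * W * G^T, where W is the 3x3 filter and
+    // G is the F(2x2, 3x3) Winograd filter-transform matrix (read off from the coefficients below):
+    //         | 1.0  0.0  0.0 |
+    //     G = | 0.5  0.5  0.5 |
+    //         | 0.5 -0.5  0.5 |
+    //         | 0.0  0.0  1.0 |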
+ float4 out0 = 0.0f;
+ float4 out1 = 0.0f;
+ float4 out2 = 0.0f;
+ float4 out3 = 0.0f;
+
+ // Row 0
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
+
+ // Row 1
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+ // Row 2
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+ // Row 3
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
+
+ int z = get_global_id(2);
+ int x0 = z / NUM_CHANNELS; // idx filter
+ int y0 = z % NUM_CHANNELS; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the 16 values across the 16 channels
+ *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global float *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global float *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 4x4
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+ float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+    // Transform the 3x3 tile into a 6x6 tile
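+    // The transform below computes out = G * W * G^T, where W is the 3x3 filter and
+    // G is the F(4x4, 3x3) Winograd filter-transform matrix:
+    //         |  1/4     0     0 |
+    //         | -1/6  -1/6  -1/6 |
+    //     G = | -1/6   1/6  -1/6 |
+    //         | 1/24  1/12   1/6 |
+    //         | 1/24 -1/12   1/6 |
+    //         |    0     0     1 |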
+ float8 out0 = 0.0f;
+ float8 out1 = 0.0f;
+ float8 out2 = 0.0f;
+ float8 out3 = 0.0f;
+ float8 out4 = 0.0f;
+ float8 out5 = 0.0f;
+
+ // Row 0
+ out0.s0 = (w0.s0) / 16.f;
+ out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+ out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+ out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s5 = (w0.s2) / 4.f;
+
+ // Row 1
+ out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+ // Row 2
+ out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+ out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+ // Row 3
+ out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 4
+ out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 5
+ out5.s0 = (w2.s0) / 4.f;
+ out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+ out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+ out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s5 = (w2.s2);
+
+ int z = get_global_id(2);
+ int x0 = z / NUM_CHANNELS; // idx filter
+ int y0 = z % NUM_CHANNELS; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the 36 values across the 36 channels
+ *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+ *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+ *(__global float *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+ *(__global float *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+ *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+ *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+ *(__global float *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+ *(__global float *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+ *(__global float *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+ *(__global float *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+ *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+ *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+ *(__global float *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+ *(__global float *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+ *(__global float *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+ *(__global float *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+ *(__global float *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+ *(__global float *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+ *(__global float *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+ *(__global float *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+ *(__global float *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+ *(__global float *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+ *(__global float *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+ *(__global float *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+ *(__global float *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+ *(__global float *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+ *(__global float *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+ *(__global float *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+ *(__global float *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+ *(__global float *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5 when the data format is NCHW and the output tile is 4x4
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+ const char stride_x = 4 * sizeof(float); // Used for accessing the last value in each row
+ const uchar8 stride_y = (uchar8)(0, 1, 2, 3, 4, 0, 0, 0) * (uchar8)src_stride_y;
+
+ float4 w00 = vload4(0, (__global float *)(src_addr + stride_y.s0));
+ float w01 = *((__global float *)(src_addr + stride_y.s0 + stride_x));
+ float4 w10 = vload4(0, (__global float *)(src_addr + stride_y.s1));
+ float w11 = *((__global float *)(src_addr + stride_y.s1 + stride_x));
+ float4 w20 = vload4(0, (__global float *)(src_addr + stride_y.s2));
+ float w21 = *((__global float *)(src_addr + stride_y.s2 + stride_x));
+ float4 w30 = vload4(0, (__global float *)(src_addr + stride_y.s3));
+ float w31 = *((__global float *)(src_addr + stride_y.s3 + stride_x));
+ float4 w40 = vload4(0, (__global float *)(src_addr + stride_y.s4));
+ float w41 = *((__global float *)(src_addr + stride_y.s4 + stride_x));
+
+    // Transform the 5x5 tile into an 8x8 tile
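+    // The transform below computes out = G * W * G^T, where W is the 5x5 filter and G is
+    // the F(4x4, 5x5) Winograd filter-transform matrix, whose rows are:
+    // [1, 0, 0, 0, 0], -2/9 * [1, 1, 1, 1, 1], -2/9 * [1, -1, 1, -1, 1],
+    // 1/90 * [1, 2, 4, 8, 16], 1/90 * [1, -2, 4, -8, 16],
+    // 1/180 * [16, 8, 4, 2, 1], 1/180 * [16, -8, 4, -2, 1], [0, 0, 0, 0, 1]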
+ float8 out0 = 0.0f;
+ float8 out1 = 0.0f;
+ float8 out2 = 0.0f;
+ float8 out3 = 0.0f;
+ float8 out4 = 0.0f;
+ float8 out5 = 0.0f;
+ float8 out6 = 0.0f;
+ float8 out7 = 0.0f;
+
+ // Row 0
+ out0.s0 = w00.s0;
+ out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+ out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+ out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+ out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+ out0.s7 = w01;
+
+ // Row 1
+ out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+ out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+ // Row 2
+ out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+ out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+ // Row 3
+ out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 4
+ out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 5
+ out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+ out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+ // Row 6
+ out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+ out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+ // Row 7
+ out7.s0 = w40.s0;
+ out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+ out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+ out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+ out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+ out7.s7 = w41;
+
+ int z = get_global_id(2);
+ int x0 = z / NUM_CHANNELS; // idx filter
+ int y0 = z % NUM_CHANNELS; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the 64 values across the 64 channels
+ *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global float *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global float *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+ *(__global float *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global float *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global float *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global float *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global float *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global float *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global float *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global float *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global float *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global float *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global float *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global float *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global float *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global float *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global float *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global float *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global float *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global float *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global float *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global float *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global float *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global float *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global float *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global float *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global float *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global float *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global float *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global float *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global float *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global float *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global float *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global float *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global float *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global float *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global float *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global float *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global float *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global float *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global float *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global float *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global float *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global float *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global float *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global float *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global float *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global float *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global float *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global float *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global float *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global float *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+}
+#endif // defined(NUM_CHANNELS)
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ // Compute input address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 2 * src_stride_x + y * 2 * src_stride_y + z * src_stride_z;
+
+ src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+ float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
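+ // Vertical pass of the Winograd input transform (tmp = B^T * d), using the
+ // standard F(2x2, 3x3) matrix B^T = [1 0 -1 0; 0 1 1 0; 0 -1 1 0; 0 1 0 -1];
+ // the out** terms below apply the same combination within each tmp row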
+ float4 tmp0 = in_row0 - in_row2;
+ float4 tmp1 = in_row1 + in_row2;
+ float4 tmp2 = in_row2 - in_row1;
+ float4 tmp3 = in_row1 - in_row3;
+
+ float out00 = tmp0.s0 - tmp0.s2;
+ float out01 = tmp0.s1 + tmp0.s2;
+ float out02 = tmp0.s2 - tmp0.s1;
+ float out03 = tmp0.s1 - tmp0.s3;
+
+ float out10 = tmp1.s0 - tmp1.s2;
+ float out11 = tmp1.s1 + tmp1.s2;
+ float out12 = tmp1.s2 - tmp1.s1;
+ float out13 = tmp1.s1 - tmp1.s3;
+
+ float out20 = tmp2.s0 - tmp2.s2;
+ float out21 = tmp2.s1 + tmp2.s2;
+ float out22 = tmp2.s2 - tmp2.s1;
+ float out23 = tmp2.s1 - tmp2.s3;
+
+ float out30 = tmp3.s0 - tmp3.s2;
+ float out31 = tmp3.s1 + tmp3.s2;
+ float out32 = tmp3.s2 - tmp3.s1;
+ float out33 = tmp3.s1 - tmp3.s3;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+
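+ // Scatter the 16 transformed values of this tile across the 16 output
+ // channels (one value per z plane), at column z and row (x + y * NUM_TILES_X)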
+ *((__global float *)(dst_addr + 0 * dst_stride_z)) = out00;
+ *((__global float *)(dst_addr + 1 * dst_stride_z)) = out01;
+ *((__global float *)(dst_addr + 2 * dst_stride_z)) = out02;
+ *((__global float *)(dst_addr + 3 * dst_stride_z)) = out03;
+ *((__global float *)(dst_addr + 4 * dst_stride_z)) = out10;
+ *((__global float *)(dst_addr + 5 * dst_stride_z)) = out11;
+ *((__global float *)(dst_addr + 6 * dst_stride_z)) = out12;
+ *((__global float *)(dst_addr + 7 * dst_stride_z)) = out13;
+ *((__global float *)(dst_addr + 8 * dst_stride_z)) = out20;
+ *((__global float *)(dst_addr + 9 * dst_stride_z)) = out21;
+ *((__global float *)(dst_addr + 10 * dst_stride_z)) = out22;
+ *((__global float *)(dst_addr + 11 * dst_stride_z)) = out23;
+ *((__global float *)(dst_addr + 12 * dst_stride_z)) = out30;
+ *((__global float *)(dst_addr + 13 * dst_stride_z)) = out31;
+ *((__global float *)(dst_addr + 14 * dst_stride_z)) = out32;
+ *((__global float *)(dst_addr + 15 * dst_stride_z)) = out33;
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3, the output tile is 2x2 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2) * 2;
+
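+ // Each work-item transforms two consecutive input channels (z and z + 1)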
+ // Compute input address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 2 * src_stride_x + y * 2 * src_stride_y + z * src_stride_z;
+
+ src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+ float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ src_addr += src_stride_z;
+ float4 in_row4 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float4 in_row5 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float4 in_row6 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float4 in_row7 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ float4 tmp0 = in_row0 - in_row2;
+ float4 tmp1 = in_row1 + in_row2;
+ float4 tmp2 = in_row2 - in_row1;
+ float4 tmp3 = in_row1 - in_row3;
+
+ float4 tmp4 = in_row4 - in_row6;
+ float4 tmp5 = in_row5 + in_row6;
+ float4 tmp6 = in_row6 - in_row5;
+ float4 tmp7 = in_row5 - in_row7;
+
+ float2 out00 = (float2)(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+ float2 out01 = (float2)(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+ float2 out02 = (float2)(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+ float2 out03 = (float2)(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+ float2 out10 = (float2)(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+ float2 out11 = (float2)(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+ float2 out12 = (float2)(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+ float2 out13 = (float2)(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+ float2 out20 = (float2)(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+ float2 out21 = (float2)(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+ float2 out22 = (float2)(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+ float2 out23 = (float2)(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+ float2 out30 = (float2)(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+ float2 out31 = (float2)(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+ float2 out32 = (float2)(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+ float2 out33 = (float2)(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+
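+ // Interleave the results of the two channels: each vstore2 writes channel z
+ // in .s0 and channel z + 1 in .s1, contiguously along the x dimension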
+ vstore2(out00, 0, (__global float *)(dst_addr + 0 * dst_stride_z));
+ vstore2(out01, 0, (__global float *)(dst_addr + 1 * dst_stride_z));
+ vstore2(out02, 0, (__global float *)(dst_addr + 2 * dst_stride_z));
+ vstore2(out03, 0, (__global float *)(dst_addr + 3 * dst_stride_z));
+ vstore2(out10, 0, (__global float *)(dst_addr + 4 * dst_stride_z));
+ vstore2(out11, 0, (__global float *)(dst_addr + 5 * dst_stride_z));
+ vstore2(out12, 0, (__global float *)(dst_addr + 6 * dst_stride_z));
+ vstore2(out13, 0, (__global float *)(dst_addr + 7 * dst_stride_z));
+ vstore2(out20, 0, (__global float *)(dst_addr + 8 * dst_stride_z));
+ vstore2(out21, 0, (__global float *)(dst_addr + 9 * dst_stride_z));
+ vstore2(out22, 0, (__global float *)(dst_addr + 10 * dst_stride_z));
+ vstore2(out23, 0, (__global float *)(dst_addr + 11 * dst_stride_z));
+ vstore2(out30, 0, (__global float *)(dst_addr + 12 * dst_stride_z));
+ vstore2(out31, 0, (__global float *)(dst_addr + 13 * dst_stride_z));
+ vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z));
+ vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, the filter size is 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ // Compute input address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 4 * src_stride_x + y * 4 * src_stride_y + z * src_stride_z;
+
+ src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
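+ // Row 4 is loaded and transformed first: in the F(4x4, 3x3) vertical
+ // transform it feeds output rows 0 to 4 with weight 1, so its horizontal
+ // transform (k0..k5) seeds the accumulators for channels 0-29 below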
+ // Row4
+ float4 d40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+ float2 d41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y));
+
+ float k0 = d41.s0;
+ float k1 = d41.s0;
+ float k2 = d41.s0;
+ float k3 = d41.s0;
+ float k4 = d41.s0;
+ float k5 = 0.0f;
+
+ k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+ k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+ k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+ k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+ k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+ k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+ // Row0
+ float4 d00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float2 d01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y));
+
+ // Row2
+ float4 d20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float2 d21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y));
+
+ // Compute destination address
+ __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+
+ uint dst_plane_stride = dst_stride_z / sizeof(float);
+
+ float out0 = k0;
+ float out1 = k1;
+ float out2 = k2;
+ float out3 = k3;
+ float out4 = k4;
+ float out5 = k5;
+ float out6 = k0;
+ float out7 = k1;
+ float out8 = k2;
+ float out9 = k3;
+ float out10 = k4;
+ float out11 = k5;
+ float out12 = k0;
+ float out13 = k1;
+ float out14 = k2;
+ float out15 = k3;
+ float out16 = k4;
+ float out17 = k5;
+ float out18 = k0;
+ float out19 = k1;
+ float out20 = k2;
+ float out21 = k3;
+ float out22 = k4;
+ float out23 = k5;
+ float out24 = k0;
+ float out25 = k1;
+ float out26 = k2;
+ float out27 = k3;
+ float out28 = k4;
+ float out29 = k5;
+
+ // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+ out0 += 16.0f * d00.s0 - 20.0f * d00.s2 - 20.0f * d20.s0 + 25.0f * d20.s2 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 - 20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 - 10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out5 += 16.0f * d00.s1 - 20.0f * d00.s3 - 20.0f * d20.s1 + 4.0f * d01.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+
+ // Row1
+ float4 d10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float2 d11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y));
+
+ // Row3
+ float4 d30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ float2 d31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y));
+
+ // Compute the common parts shared by channels [6, 29]
+ // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
+ // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+ float part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+ float part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+ float part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
+ float part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
+ float part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+ float part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+ float part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
+ float part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+ float part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+ float part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
+ float part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+ float part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+ // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+ // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+ float part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+ float part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+ float part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+ float part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+ float part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+ float part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+ float part18 = part6 * 0.25f; // d20.s2 - d21.s0
+ float part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+ float part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+ float part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
+ float part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+ float part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+ out6 += part0 - part1;
+ out12 += part0 + part1;
+ out7 += part2 + part3 + part4 + part5;
+ out8 += part2 - part3 + part4 - part5;
+ out13 += part2 + part3 - part4 - part5;
+ out14 += part2 - part3 - part4 + part5;
+ out9 += part6 + part7 + part8 + part9;
+ out10 += part6 - part7 + part8 - part9;
+ out15 += part6 - part7 - part8 + part9;
+ out16 += part6 + part7 - part8 - part9;
+ out11 += part10 + part11;
+ out17 += part10 - part11;
+
+ out18 += part13 - part12;
+ out24 += part13 + part12;
+ out19 += part14 + part15 + part16 + part17;
+ out20 += part14 - part15 + part16 - part17;
+ out25 += part14 - part15 - part16 + part17;
+ out26 += part14 + part15 - part16 - part17;
+ out21 += part18 + part19 + part20 + part21;
+ out22 += part18 - part19 + part20 - part21;
+ out27 += part18 - part19 - part20 + part21;
+ out28 += part18 + part19 - part20 - part21;
+ out23 += part22 + part23;
+ out29 += part22 - part23;
+
+ *(dst_addr) = out6;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out7;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out8;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out9;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out10;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out11;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out12;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out13;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out14;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out15;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out16;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out17;
+ dst_addr += dst_plane_stride;
+
+ *(dst_addr) = out18;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out19;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out20;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out21;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out22;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out23;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out24;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out25;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out26;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out27;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out28;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out29;
+ dst_addr += dst_plane_stride;
+
+ // Row5
+ float4 d50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y));
+ float2 d51 = vload2(2, (__global float *)(src_addr + 5 * src_stride_y));
+
+ // Channels [30, 35]
+ out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+}
+
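+// Applies one row of the 8-point F(4x4, 5x5) Winograd input transform
+// (coefficients +/-5.25, 4.25, 2.5, 1.25, ...); comm_fact caches the
+// sub-expressions shared between pairs of output elements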
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = 2.5f * tmp.s3; \
+ comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
+ comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+ })
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5 and the output tile is 4x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ // Compute input address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 4 * src_stride_x + y * 4 * src_stride_y + z * src_stride_z;
+
+ src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+ // Load 8x8 input tile
+ const float8 in_row0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
+ const float8 in_row1 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
+ const float8 in_row2 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
+ const float8 in_row3 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
+ const float8 in_row4 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
+ const float8 in_row5 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
+ const float8 in_row6 = vload8(0, (__global float *)(src_addr + 6 * src_stride_y));
+ const float8 in_row7 = vload8(0, (__global float *)(src_addr + 7 * src_stride_y));
+
+ // Calculate common factors for intermediate tensor
+ float8 comm_fact0 = in_row2 + in_row6 - 4.25f * in_row4;
+ float8 comm_fact1 = in_row1 + in_row5 - 4.25f * in_row3;
+ float8 comm_fact2 = 0.25f * in_row2 - 1.25f * in_row4 + in_row6;
+
+ // Calculate intermediate tensor and reuse common factor vectors
+ const float8 tmp0 = in_row0 - in_row6 + 5.25f * in_row4 - 5.25f * in_row2;
+ const float8 tmp1 = comm_fact0 + comm_fact1;
+ const float8 tmp2 = comm_fact0 - comm_fact1;
+
+ comm_fact0 = 2.5f * in_row3;
+ comm_fact1 = 0.5f * in_row1 - comm_fact0 + 2.f * in_row5;
+
+ const float8 tmp3 = comm_fact1 + comm_fact2;
+ const float8 tmp4 = comm_fact2 - comm_fact1;
+
+ comm_fact1 = 2.f * in_row1 - comm_fact0 + 0.5f * in_row5;
+ comm_fact2 = 4.f * in_row2 - 5.f * in_row4 + in_row6;
+
+ const float8 tmp5 = comm_fact1 + comm_fact2;
+ const float8 tmp6 = comm_fact2 - comm_fact1;
+ const float8 tmp7 = in_row7 - in_row1 + 5.25f * in_row3 - 5.25f * in_row5;
+
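+ // Horizontal pass: apply the same 8-point combination within each
+ // intermediate row to complete B^T * d * B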
+ // Calculate output rows (reuse comm_fact0 vector)
+ float8 out0, out1, out2, out3, out4, out5, out6, out7;
+
+ OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+
+ // Store values across the 64 channels
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+
+ *((__global float *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global float *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global float *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global float *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global float *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global float *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global float *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global float *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+ *((__global float *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global float *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global float *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global float *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global float *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global float *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global float *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global float *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global float *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global float *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global float *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global float *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global float *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global float *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global float *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global float *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global float *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global float *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global float *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global float *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global float *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global float *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global float *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global float *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global float *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global float *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global float *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global float *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global float *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global float *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global float *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global float *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global float *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global float *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global float *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global float *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global float *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global float *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global float *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global float *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global float *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global float *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global float *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global float *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global float *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global float *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global float *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global float *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global float *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global float *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global float *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global float *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global float *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global float *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global float *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+}
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
+
+#if defined(NUM_TILES_X)
+/** This OpenCL kernel performs the Winograd output transform when the output tile is 2x2, the filter size is 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 2x2 tile
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+ // Load the values across the 16 channels to compose the 4x4 tile
+ float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+ float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+ float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+ float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+
+ float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
+ float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
+ float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
+ float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
+
+ float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
+ float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
+ float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
+ float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+ float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
+ float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
+ float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
+ float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
+
+ // Compute the 2x2 output tile
+ float k0 = d01 + d11 + d21;
+ float k1 = d02 + d12 + d22;
+ float k2 = d11 - d21 - d31;
+ float k3 = d12 - d22 - d32;
+
+ // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+ // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+ // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+ // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+
+ float out00 = d10;
+ float out01 = -d13;
+ float out10 = d10;
+ float out11 = -d13;
+
+ out00 += d00 + d20 + k0 + k1;
+ out01 += k0 - k1 - (d03 + d23);
+ out10 += -d20 - d30 + k2 + k3;
+ out11 += k2 - k3 + d23 + d33;
+
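+ // get_global_id(1) walks the tiles in raster order: recover the tile's
+ // top-left corner in the output feature map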
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * 2;
+ int y_out = (y_in / NUM_TILES_X) * 2;
+ int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out10 += (float)b;
+ out11 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+ // Store the 2x2 output tile
+ vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+}
+
+/** This OpenCL kernel performs the Winograd output transform when the output tile is 4x4, the filter size is 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4 tile
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+ // Load the values across the 36 channels to compose the 6x6 tile
+ float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+ float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+ float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+ float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+ float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
+ float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+
+ float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
+ float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
+ float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
+ float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
+ float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
+ float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+ float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
+ float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
+ float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
+ float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
+ float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
+ float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+
+ float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
+ float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
+ float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
+ float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
+ float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
+ float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+
+ float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
+ float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
+ float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
+ float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
+ float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
+ float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+
+ float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
+ float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
+ float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
+ float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
+ float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
+ float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+
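+ // The output tile is out = A^T * d * A with the F(4x4, 3x3) matrix
+ // A^T = [1 1 1 1 1 0; 0 1 -1 2 -2 0; 0 1 1 4 4 0; 0 1 -1 8 -8 1];
+ // terms shared by all four outputs of a row are hoisted into the initial
+ // values and into the k0/k1 factors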
+ // Compute out00, out01, out02 and out03
+ float out00 = d01 + d21 + d41 + d11 + d31;
+ float out01 = d01 + d21 + d41 + d11 + d31;
+ float out02 = d01 + d21 + d41 + d11 + d31;
+ float out03 = d01 + d21 + d41 + d11 + d31;
+
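+ // k0/k1 gather the column-3/4 terms: columns 3 and 4 enter A with the
+ // weight pairs (1, 1), (2, -2), (4, 4) and (8, -8) for the four outputs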
+ float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+ float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+ out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+ out01 += k1 - d02 - d12 - d22 - d32 - d42;
+ out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+ out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+ // Compute out10, out11, out12 and out13
+ float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+ k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+ out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+ out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+ out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+ out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+ // Compute out20, out21, out22 and out23
+ float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+ k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+ out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+ out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+ out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+ out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+ // Compute out30, out31, out32 and out33
+ float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+ k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+ out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+ out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+ out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+ out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * 4;
+ int y_out = (y_in / NUM_TILES_X) * 4;
+ int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+
+ out10 += (float)b;
+ out11 += (float)b;
+ out12 += (float)b;
+ out13 += (float)b;
+
+ out20 += (float)b;
+ out21 += (float)b;
+ out22 += (float)b;
+ out23 += (float)b;
+
+ out30 += (float)b;
+ out31 += (float)b;
+ out32 += (float)b;
+ out33 += (float)b;
+
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+ // Store the 4x4 output tile
+ vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+ vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+ vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+}
+
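+// Applies the F(4x4, 5x5) output-transform matrix A^T down one column of the
+// 8x8 tile (row weights built from +/-1, 2, 4 and 8), producing four values;
+// comm_fact holds the pairwise sums/differences shared between output rows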
+#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
+ ({ \
+ comm_fact.s0 = d1 + d2; \
+ comm_fact.s1 = d3 + d4; \
+ comm_fact.s2 = d5 + d6; \
+ \
+ col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
+ col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
+ \
+ comm_fact.s0 = d1 - d2; \
+ comm_fact.s1 = d3 - d4; \
+ comm_fact.s2 = d5 - d6; \
+ \
+ col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
+ col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
+ })
+
+/** This OpenCL kernel performs the Winograd output transform when the output tile is 4x4, the filter size is 5x5 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4 tile
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+ // Load the values across the 64 channels to compose the 8x8 input tile
+ float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+ float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+ float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+ float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+ float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
+ float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+ float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
+ float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
+
+ float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
+ float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
+ float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
+ float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
+ float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
+ float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
+ float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
+ float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
+
+ float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
+ float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
+ float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
+ float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
+ float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
+ float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
+ float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
+ float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
+
+ float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
+ float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
+ float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
+ float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
+ float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
+ float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
+ float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
+ float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
+
+ float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
+ float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
+ float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
+ float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
+ float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
+ float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
+ float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
+ float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
+
+ float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
+ float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
+ float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
+ float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
+ float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
+ float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
+ float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
+ float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
+
+ float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
+ float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
+ float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
+ float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
+ float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
+ float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
+ float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
+ float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
+
+ float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
+ float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
+ float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
+ float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
+ float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
+ float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
+ float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
+ float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
+
+ // Compute the 8x4 intermediate tensor
+ float4 comm_fact0, comm_fact1, comm_fact2;
+ float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+ COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
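+ // Second pass: apply the same A^T combination across the intermediate
+ // columns to produce the 4x4 spatial output tile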
+ // Compute the 4x4 output tile
+ comm_fact0 = tmp_col1 + tmp_col2;
+ comm_fact1 = tmp_col3 + tmp_col4;
+ comm_fact2 = tmp_col5 + tmp_col6;
+
+ float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
+ float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+
+ comm_fact0 = tmp_col1 - tmp_col2;
+ comm_fact1 = tmp_col3 - tmp_col4;
+ comm_fact2 = tmp_col5 - tmp_col6;
+
+ float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
+ float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * 4;
+ int y_out = (y_in / NUM_TILES_X) * 4;
+ int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+ out_col0 += (float4)b;
+ out_col1 += (float4)b;
+ out_col2 += (float4)b;
+ out_col3 += (float4)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+ // Store the 4x4 output tile
+ *(__global float *)(dst_addr + 0 * dst_stride_x + 0 * dst_stride_y) = out_col0.s0;
+ *(__global float *)(dst_addr + 1 * dst_stride_x + 0 * dst_stride_y) = out_col1.s0;
+ *(__global float *)(dst_addr + 2 * dst_stride_x + 0 * dst_stride_y) = out_col2.s0;
+ *(__global float *)(dst_addr + 3 * dst_stride_x + 0 * dst_stride_y) = out_col3.s0;
+ *(__global float *)(dst_addr + 0 * dst_stride_x + 1 * dst_stride_y) = out_col0.s1;
+ *(__global float *)(dst_addr + 1 * dst_stride_x + 1 * dst_stride_y) = out_col1.s1;
+ *(__global float *)(dst_addr + 2 * dst_stride_x + 1 * dst_stride_y) = out_col2.s1;
+ *(__global float *)(dst_addr + 3 * dst_stride_x + 1 * dst_stride_y) = out_col3.s1;
+ *(__global float *)(dst_addr + 0 * dst_stride_x + 2 * dst_stride_y) = out_col0.s2;
+ *(__global float *)(dst_addr + 1 * dst_stride_x + 2 * dst_stride_y) = out_col1.s2;
+ *(__global float *)(dst_addr + 2 * dst_stride_x + 2 * dst_stride_y) = out_col2.s2;
+ *(__global float *)(dst_addr + 3 * dst_stride_x + 2 * dst_stride_y) = out_col3.s2;
+ *(__global float *)(dst_addr + 0 * dst_stride_x + 3 * dst_stride_y) = out_col0.s3;
+ *(__global float *)(dst_addr + 1 * dst_stride_x + 3 * dst_stride_y) = out_col1.s3;
+ *(__global float *)(dst_addr + 2 * dst_stride_x + 3 * dst_stride_y) = out_col2.s3;
+ *(__global float *)(dst_addr + 3 * dst_stride_x + 3 * dst_stride_y) = out_col3.s3;
+}
+#endif // defined(NUM_TILES_X)
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 87fc1d0..293361b 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -46,10 +46,23 @@
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ if(beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
+ }
+ if(gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
+ }
+
if(act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
@@ -62,6 +75,7 @@
if(output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
@@ -69,7 +83,8 @@
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma)
{
if(output != nullptr)
{
@@ -95,6 +110,24 @@
window_changed = update_window_and_padding(win, input_access);
}
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, mean_access, var_access);
+
+ if(beta != nullptr)
+ {
+ AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, beta_access);
+ }
+ if(gamma != nullptr)
+ {
+ AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, gamma_access);
+ }
+ }
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
@@ -108,7 +141,7 @@
void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
float epsilon, ActivationLayerInfo act_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
_input = input;
_output = output;
@@ -120,15 +153,9 @@
_run_in_place = (output == nullptr) || (output == input);
- if(output != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
- }
-
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
- mean->info(), var->info(), beta->info(), gamma->info(), epsilon, act_info));
+ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
@@ -136,26 +163,41 @@
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option_if(act_info.enabled(), "-D" + string_from_activation_func(act_info.activation()));
+ build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ build_opts.add_option_if(beta == nullptr, "-DUSE_DEFAULT_BETA");
+ build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts.options()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
// Set kernel static arguments
unsigned int include_output = (!_run_in_place) ? 1 : 0;
- unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+    unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input, output, mean and var parameters
+ if(_beta != nullptr)
+ {
+ idx += num_arguments_per_1D_tensor(); // Skip beta parameter
+ }
+ if(_gamma != nullptr)
+ {
+ idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
+ }
_kernel.setArg<cl_float>(idx++, _epsilon);
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+ auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(),
+ mean->info(), var->info(),
+ (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
_config_id = "batch_normalization_layer_";
+ _config_id += string_from_data_layout(input->info()->data_layout());
+ _config_id += "_";
_config_id += string_from_data_type(input->info()->data_type());
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(0));
@@ -172,7 +214,11 @@
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(),
+ mean->clone().get(), var->clone().get(),
+ (beta != nullptr) ? beta->clone().get() : nullptr,
+ (gamma != nullptr) ? gamma->clone().get() : nullptr)
+ .first);
return Status{};
}
@@ -191,8 +237,14 @@
unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, vector_slice);
add_1D_tensor_argument(idx, _var, vector_slice);
- add_1D_tensor_argument(idx, _beta, vector_slice);
- add_1D_tensor_argument(idx, _gamma, vector_slice);
+ if(_beta != nullptr)
+ {
+ add_1D_tensor_argument(idx, _beta, vector_slice);
+ }
+ if(_gamma != nullptr)
+ {
+ add_1D_tensor_argument(idx, _gamma, vector_slice);
+ }
do
{
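Note: with the hunks above, beta and gamma become optional inputs to CLBatchNormalizationLayerKernel; passing nullptr selects the -DUSE_DEFAULT_BETA / -DUSE_DEFAULT_GAMMA kernel paths. A minimal, hypothetical configuration sketch follows (tensor setup omitted; the defaults beta = 0 and gamma = 1 are the standard batch-normalization identities, inferred rather than stated in this diff):

    CLBatchNormalizationLayerKernel bn_kernel;
    // nullptr for beta/gamma builds the kernel with USE_DEFAULT_BETA /
    // USE_DEFAULT_GAMMA, skipping the two optional 1D tensor arguments.
    bn_kernel.configure(&input, &output, &mean, &var,
                        /*beta=*/nullptr, /*gamma=*/nullptr,
                        /*epsilon=*/0.001f, ActivationLayerInfo());
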
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
index d729ebc..6e55e66 100644
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,41 +48,62 @@
void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
- const Format fmt = output->info()->format();
- _planes[0] = plane0;
- _planes[1] = plane1;
- _planes[2] = plane2;
- if(Format::RGBA8888 == fmt)
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+
+ const Format output_format = output->info()->format();
+
+ // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
+ if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
{
+ // Validate Y plane of input and output
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
+
+        // Validate the U and V planes of the input
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
+ }
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+
+ // Validate the last input tensor only for RGBA format
+ if(Format::RGBA8888 == output_format)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
+
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
+
_planes[3] = plane3;
}
- else
- {
- _planes[3] = nullptr;
- }
+
_output = output;
_output_multi = nullptr;
- // Half the processed elements for U,V channels due to sub-sampling of 2
- if(Format::YUYV422 == fmt || Format::UYVY422 == fmt)
+ // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
+ if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
{
- _x_subsampling = { { 1, 2, 2 } };
- _y_subsampling = { { 1, 2, 2 } };
- }
- else
- {
- _x_subsampling = { { 1, 1, 1 } };
- _y_subsampling = { { 1, 1, 1 } };
+ _x_subsampling[1] = 2;
+ _x_subsampling[2] = 2;
}
// Create kernel
- std::string kernel_name = "channel_combine_" + string_from_format(fmt);
+ std::string kernel_name = "channel_combine_" + string_from_format(output_format);
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
// Configure window
@@ -112,50 +133,78 @@
void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
- _planes[0] = plane0;
- _planes[1] = plane1;
- _planes[2] = plane2;
- _planes[3] = nullptr;
- _output = nullptr;
- _output_multi = output;
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+
+ const Format output_format = output->info()->format();
+
+    // Validate that the shape of the Y plane is even and that the U and V planes have the correct sub-sampled dimensions
+    // Perform this validation only for formats which require sub-sampling.
+ if(Format::YUV444 != output_format)
+ {
+ // Validate Y plane of input and output
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
+
+        // Validate the U and V planes of the input
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
+
+        // Validate the second plane U (NV12 and NV21 have a combined UV88 plane while IYUV has only the U plane)
+        // MultiImage generates the correct tensor shape, but check it anyway in case the tensor shapes of the planes were changed to an incorrect size
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
+
+ // Validate the last plane V of format IYUV
+ if(Format::IYUV == output_format)
+ {
+            // Validate the V plane of the output
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
+ }
+ }
+
+ // Set input tensors
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+
+ // Set output tensor
+ _output = nullptr;
+ _output_multi = output;
+
bool has_two_planars = false;
// Set sub-sampling parameters for each plane
- const Format fmt = output->info()->format();
std::string kernel_name;
std::set<std::string> build_opts;
- if(Format::NV12 == fmt || Format::NV21 == fmt)
+ if(Format::NV12 == output_format || Format::NV21 == output_format)
{
_x_subsampling = { { 1, 2, 2 } };
_y_subsampling = { { 1, 2, 2 } };
kernel_name = "channel_combine_NV";
- build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21");
+ build_opts.emplace(Format::NV12 == output_format ? "-DNV12" : "-DNV21");
has_two_planars = true;
}
else
{
- if(Format::IYUV == fmt)
+ if(Format::IYUV == output_format)
{
_x_subsampling = { { 1, 2, 2 } };
_y_subsampling = { { 1, 2, 2 } };
}
- else
- {
- _x_subsampling = { { 1, 1, 1 } };
- _y_subsampling = { { 1, 1, 1 } };
- }
kernel_name = "copy_planes_3p";
- build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444");
+ build_opts.emplace(Format::IYUV == output_format ? "-DIYUV" : "-DYUV444");
}
// Create kernel
@@ -166,12 +215,12 @@
Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
- AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
- AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]);
- AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
- AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+ AccessWindowRectangle input_plane0_access(plane0->info(), 0, 0, num_elems_processed_per_iteration, 1.f);
+ AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+ AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f, 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
update_window_and_padding(win,
input_plane0_access, input_plane1_access, input_plane2_access,
@@ -192,6 +241,7 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
Window slice = window.first_slice_window_2D();
+ slice.set_dimension_step(Window::DimY, 1);
do
{
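A worked shape example for the single-plane YUV422 paths above (illustrative sizes, not taken from the diff):

    // For YUYV422/UYVY422 outputs only horizontal sub-sampling applies:
    //   plane0 (Y): 640 x 480
    //   plane1 (U): 320 x 480   // width = 640 / _x_subsampling[1]
    //   plane2 (V): 320 x 480   // width = 640 / _x_subsampling[2]
    // The even-width requirement on the Y plane is what
    // ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN enforces above.
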
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
new file mode 100644
index 0000000..a667119
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::QS16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with fewer than 2 groups would be inefficient");
+
+ const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with the same number of groups as channels would be inefficient");
+ // There cannot be more groups than channels
+ ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _input = input;
+ _output = output;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+
+ const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+
+ // Set kernel build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
+ build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ switch(input->info()->element_size())
+ {
+ case 1:
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ case 2:
+ build_opts.add_option("-DDATA_TYPE=ushort");
+ break;
+ case 4:
+ build_opts.add_option("-DDATA_TYPE=uint");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
+
+Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLChannelShuffleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
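The -DNUM_GROUPS and -DK build options above encode the usual ShuffleNet-style channel permutation. A reference sketch of the assumed mapping (the OpenCL kernel body is not part of this diff; shuffled_channel is an illustrative helper, not a library function):

    // With G groups and K = channels / G, input channel c = g * K + k
    // (g in [0, G), k in [0, K)) moves to output channel k * G + g.
    unsigned int shuffled_channel(unsigned int c, unsigned int G, unsigned int K)
    {
        const unsigned int g = c / K;
        const unsigned int k = c % K;
        return k * G + g;
    }
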
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 8ccec06..91c0430 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -51,7 +51,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
@@ -111,8 +111,8 @@
// Configure the local work size for Bifrost with a value obtained
// via exhaustive autotuning over 30 representative tensor shapes.
- const GPUTarget gpu_target = get_arch_from_target(get_target());
- if(gpu_target == GPUTarget::BIFROST)
+ const GPUTarget gpu_target = get_target();
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
if((_convolved_dims.first == 7) || (_convolved_dims.first == 14))
{
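gpu_target_is_in is assumed here to be a variadic membership test over GPUTarget values, roughly:

    // Sketch of assumed semantics: (target == t0) || (target == t1) || ...
    inline bool gpu_target_is_in(GPUTarget target, GPUTarget t)
    {
        return target == t;
    }
    template <typename... Args>
    bool gpu_target_is_in(GPUTarget target, GPUTarget t, Args... rest)
    {
        return target == t || gpu_target_is_in(target, rest...);
    }
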
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000..1b211b0
--- /dev/null
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLConvertFullyConnectedWeightsKernel::CLConvertFullyConnectedWeightsKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
+
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
+ const unsigned int num_channels = original_input_shape.z();
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ if(data_layout == DataLayout::NCHW)
+ {
+ build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(num_elems_per_input_plane));
+ build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(num_channels));
+ }
+ else
+ {
+ build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(num_channels));
+ build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(num_elems_per_input_plane));
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convert_fc_weights", build_opts.options()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICLKernel::configure(win);
+}
+
+Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+ DataType::QS32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+ return Status{};
+}
+
+void CLConvertFullyConnectedWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, window);
+ add_2D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window);
+}
+} // namespace arm_compute
\ No newline at end of file
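The FACTOR_1/FACTOR_2 options above describe a row permutation between the NCHW- and NHWC-flattened orderings of the original 3D input volume. A sketch of an index mapping consistent with those options (the OpenCL kernel body is not part of this diff; converted_row is an illustrative helper):

    // Each 2D weight row corresponds to one element of the flattened
    // original input. NCHW: factor_1 = W * H, factor_2 = C.
    // NHWC: factor_1 = C, factor_2 = W * H.
    unsigned int converted_row(unsigned int row, unsigned int factor_1, unsigned int factor_2)
    {
        return (row % factor_1) * factor_2 + row / factor_1;
    }
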
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
new file mode 100644
index 0000000..4f00ef9
--- /dev/null
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLCopyKernel::CLCopyKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(), output->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ ICLKernel::configure(win);
+}
+
+void CLCopyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+ Window slice = collapsed.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_1D(slice));
+}
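A minimal usage sketch for the new copy kernel (hypothetical; assumes already-allocated CLTensor objects src and dst of matching shape and data type):

    CLCopyKernel copy;
    copy.configure(&src, &dst);
    // run() collapses the window to 1D where strides allow, so a dense
    // tensor is enqueued as a few long 1D slices rather than many rows.
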
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
deleted file mode 100644
index 29564b3..0000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-CLDepthwiseConvolutionLayer3x3Kernel::CLDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
- bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- if(biases != nullptr)
- {
- if(is_qasymm)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
-
- // Get convolved dimensions
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(),
- output_shape,
- 1,
- input->info()->data_type(),
- input->info()->fixed_point_position(),
- input->info()->quantization_info());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_x = conv_info.stride().first;
- _conv_stride_y = conv_info.stride().second;
- _conv_pad_left = conv_info.pad_left();
- _conv_pad_top = conv_info.pad_top();
- _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
-
- // Set build options
- ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
- CLBuildOptions build_opts;
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
- if(is_qasymm)
- {
- float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- }
-
- // Configure the local work size for Bifrost with a value obtained
- // via exhaustive autotuning for the MobileNets tensor shapes.
- const GPUTarget gpu_target = get_arch_from_target(get_target());
-
- // Configure kernel window
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- // Create kernel
- std::string kernel_name;
-
- if(input->info()->data_type() == DataType::F16)
- {
- kernel_name = "depthwise_convolution_3x3_f16";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_y = 3;
- switch(_conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 9;
- break;
- case 3:
- num_elems_read_per_iteration_x = 16;
- break;
- default:
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
- break;
- }
- }
- else if(input->info()->data_type() == DataType::F32 && gpu_target == GPUTarget::BIFROST)
- {
- if(_conv_stride_x == 1 && _conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost";
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 4;
- }
- else if(_conv_stride_x == 2 && _conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost";
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 2;
- }
- else
- {
- kernel_name = "depthwise_convolution_3x3";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
- num_elems_read_per_iteration_y = 3;
- }
- }
- else
- {
- kernel_name = is_qasymm ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
- num_elems_written_per_iteration_y = (is_qasymm && _conv_stride_y < 3) ? (2 / _conv_stride_y) : 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
- num_elems_read_per_iteration_y = num_elems_written_per_iteration_y + 2;
- }
-
- // Create window and update padding
- Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(input->info(), -_conv_pad_left, -_conv_pad_top,
- num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- _conv_stride_x, _conv_stride_y);
- AccessWindowStatic weights_access(weights->info(), 0, 0, 3, 3);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
- update_window_and_padding(win, input_access, weights_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
-
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Create input window and adjust
- Window win_in = window;
- win_in.adjust(Window::DimX, -_conv_pad_left, true);
- win_in.adjust(Window::DimY, -_conv_pad_top, true);
- win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
- win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
-
- Window slice_in = win_in.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
- Window slice_weights = window.first_slice_window_3D();
- slice_weights.set_dimension_step(Window::DimX, 0);
- slice_weights.set_dimension_step(Window::DimY, 0);
-
- // Set biases
- if(_biases != nullptr)
- {
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx, _biases, slice_biases);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_weights);
-
- enqueue(queue, *this, slice_out, _lws_hint);
- }
- while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
new file mode 100644
index 0000000..e4ad97f
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU))),
+ "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
+
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+
+ if(biases != nullptr)
+ {
+ if(is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ if(output->total_size() != 0)
+ {
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ GPUTarget gpu_target, std::string &kernel_name)
+{
+    // Output auto initialization if not yet initialized
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+ const unsigned int conv_stride_x = conv_info.stride().first;
+ const unsigned int conv_stride_y = conv_info.stride().second;
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+
+ // Configure kernel window
+ unsigned int num_elems_read_per_iteration_x = 0;
+ unsigned int num_elems_read_per_iteration_y = 0;
+ unsigned int num_elems_written_per_iteration_x = 0;
+ unsigned int num_elems_written_per_iteration_y = 0;
+
+ if(input->data_type() == DataType::F16)
+ {
+ kernel_name = "depthwise_convolution_3x3_f16";
+ num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
+ num_elems_written_per_iteration_y = 1;
+ num_elems_read_per_iteration_y = 3;
+ switch(conv_stride_x)
+ {
+ case 1:
+ num_elems_read_per_iteration_x = 8;
+ break;
+ case 2:
+ num_elems_read_per_iteration_x = 9;
+ break;
+ case 3:
+ num_elems_read_per_iteration_x = 16;
+ break;
+ default:
+ num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
+ break;
+ }
+ if(is_bifrost)
+ {
+ if(conv_stride_x == 1 && conv_stride_y == 1)
+ {
+ kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16";
+ num_elems_read_per_iteration_x = 8;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 6;
+ num_elems_written_per_iteration_y = 4;
+ }
+ else if(conv_stride_x == 2 && conv_stride_y == 2)
+ {
+ kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16";
+ num_elems_read_per_iteration_x = 10;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 5;
+ num_elems_written_per_iteration_y = 2;
+ }
+ }
+ }
+ else if(input->data_type() == DataType::F32 && is_bifrost)
+ {
+ if(conv_stride_x == 1 && conv_stride_y == 1)
+ {
+ kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32";
+ num_elems_read_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 6;
+ num_elems_written_per_iteration_x = 2;
+ num_elems_written_per_iteration_y = 4;
+ }
+ else if(conv_stride_x == 2 && conv_stride_y == 2)
+ {
+ kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32";
+ num_elems_read_per_iteration_x = 6;
+ num_elems_read_per_iteration_y = 5;
+ num_elems_written_per_iteration_x = 2;
+ num_elems_written_per_iteration_y = 2;
+ }
+ else
+ {
+ kernel_name = "depthwise_convolution_3x3";
+ num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
+ num_elems_written_per_iteration_y = 1;
+ num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
+ num_elems_read_per_iteration_y = 3;
+ }
+ }
+ else
+ {
+ kernel_name = is_qasymm ? "depthwise_convolution_3x3_quantized_nchw" : "depthwise_convolution_3x3";
+ num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
+ num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y < 3) ? (2 / conv_stride_y) : 1;
+ num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
+ num_elems_read_per_iteration_y = num_elems_written_per_iteration_y + 2;
+ }
+
+ // Create window and update padding
+ Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+ AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
+ num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+ conv_stride_x, conv_stride_y);
+ AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+ bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel()
+ : _conv_stride_x(0), _conv_pad_top(0)
+{
+}
+
+BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _biases = biases;
+ _conv_stride_x = conv_info.stride().first;
+ _conv_stride_y = conv_info.stride().second;
+ _conv_pad_left = conv_info.pad_left();
+ _conv_pad_top = conv_info.pad_top();
+ _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+ build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+ build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+
+ if(is_qasymm)
+ {
+ float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
+ build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
+ build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
+ build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
+ build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
+ if(act_info.enabled())
+ {
+ const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ const int o1 = input->info()->quantization_info().offset;
+
+ build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+ build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+
+ if(output != nullptr)
+ {
+ const float s1 = input->info()->quantization_info().scale;
+ const float s2 = output->info()->quantization_info().scale;
+ const int o2 = output->info()->quantization_info().offset;
+
+ if(o1 != o2 || s1 != s2)
+ {
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
+ build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
+ }
+ }
+ }
+ }
+
+ // Configure kernel window
+ std::string kernel_name;
+ const GPUTarget gpu_target = get_target();
+
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info, GPUTarget gpu_target)
+{
+ std::string kernel_name;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name).first);
+
+ return Status{};
+}
+
+void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Create input window and adjust
+ Window win_in = window;
+ win_in.adjust(Window::DimX, -_conv_pad_left, true);
+ win_in.adjust(Window::DimY, -_conv_pad_top, true);
+ win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+ win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+ Window slice_in = win_in.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_weights = window.first_slice_window_3D();
+ slice_weights.set_dimension_step(Window::DimX, 0);
+ slice_weights.set_dimension_step(Window::DimY, 0);
+
+ // Set biases
+ if(_biases != nullptr)
+ {
+ unsigned int idx = 3 * num_arguments_per_3D_tensor();
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+ add_1D_tensor_argument(idx, _biases, slice_biases);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ add_3D_tensor_argument(idx, _weights, slice_weights);
+
+ enqueue(queue, *this, slice_out, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
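
The fused-activation defines above (-DA_VAL, -DB_VAL, -DCONST_0) carry the activation bounds already quantized into the input's QASYMM8 domain via QuantizationInfo::quantize(). As a point of reference, here is a minimal standalone sketch of that mapping; the [0, 255] clamp and round-half-up behaviour are assumptions based on the 8-bit asymmetric scheme, not the library's exact code.

#include <algorithm>
#include <cmath>

// Sketch: quantize a real value into the QASYMM8 domain.
// q = round(x / scale) + offset, clamped to [0, 255].
static int quantize_qasymm8(float x, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(x / scale)) + offset;
    return std::min(255, std::max(0, q));
}

// Example: a BOUNDED_RELU cap of 6.0f with scale = 1.0f / 128.0f and offset 0
// gives q = 768, which clamps to 255, i.e. the activation saturates in 8 bits.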
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
new file mode 100644
index 0000000..a54e92c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
+ "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ if(output->total_size() != 0)
+ {
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &conv_info)
+{
+ const unsigned int num_rows_processed_per_iteration = 4;
+ const unsigned int num_elems_accessed_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2;
+ const unsigned int num_rows_written_per_iteration = num_rows_processed_per_iteration / conv_info.stride().first;
+
+ const BorderSize border_size(conv_info.pad_left() + num_rows_read_per_iteration * std::max(conv_info.pad_top(), conv_info.pad_bottom()), 0, conv_info.pad_right(), 0);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
+ ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
+ AccessWindowHorizontal weights_access(weights, 0, num_elems_accessed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+
+ if(bias != nullptr)
+ {
+ AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
+ }
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
+ : _num_rows_processed_per_iteration(1)
+{
+}
+
+BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Get convolved dimensions
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(),
+ output_shape,
+ 1,
+ input->info()->data_type(),
+ input->info()->fixed_point_position(),
+ input->info()->quantization_info());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
+
+ const unsigned int conv_stride_x = conv_info.stride().first;
+ ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
+ ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _biases = biases;
+ _conv_stride_y = conv_info.stride().second;
+ _conv_pad_left = conv_info.pad_left();
+ _num_rows_processed_per_iteration = 4;
+
+ const unsigned int num_elems_accessed_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = _num_rows_processed_per_iteration + 2;
+
+ _border_size = BorderSize(_conv_pad_left + num_rows_read_per_iteration * std::max(conv_info.pad_top(), conv_info.pad_bottom()), 0, conv_info.pad_right(), 0);
+
+ float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+ build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
+ build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
+ build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
+ build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
+ build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.add_option("-DROWS_READ=" + support::cpp11::to_string(num_rows_read_per_iteration));
+
+ if(act_info.enabled())
+ {
+ const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ const int o1 = input->info()->quantization_info().offset;
+
+ build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+ build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+
+ if(output != nullptr)
+ {
+ const float s1 = input->info()->quantization_info().scale;
+ const float s2 = output->info()->quantization_info().scale;
+ const int o2 = output->info()->quantization_info().offset;
+
+ if(o1 != o2 || s1 != s2)
+ {
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
+ build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
+ }
+ }
+ }
+
+ // Create kernel
+ std::string kernel_name = std::string("depthwise_convolution_3x3_quantized_nhwc_stride") + support::cpp11::to_string(conv_stride_x);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
+ biases != nullptr ? biases->clone().get() : nullptr,
+ output->clone().get(), conv_info)
+ .first);
+
+ return Status{};
+}
+
+void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Create input window and adjust
+ Window win_in = window;
+ win_in.adjust(Window::DimY, -_conv_pad_left, true);
+ win_in.set_dimension_step(Window::DimY, _num_rows_processed_per_iteration);
+ win_in.set_dimension_step(Window::DimZ, _conv_stride_y);
+
+ ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
+
+ Window slice_in = win_in.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+
+ if(_biases != nullptr)
+ {
+ unsigned int idx = 3 * num_arguments_per_3D_tensor();
+ Window win_biases;
+ win_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+ win_biases.set_dimension_step(Window::DimX, window.x().step());
+ add_1D_tensor_argument(idx, _biases, win_biases);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ add_3D_tensor_argument(idx, _weights, slice_out);
+
+ enqueue(queue, *this, slice_out, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
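
-DOUTPUT_MULTIPLIER and -DOUTPUT_SHIFT come from quantization::calculate_quantized_multiplier_less_than_one(), which splits the real rescale factor multiplier = s_input * s_weights / s_output into a Q0.31 fixed-point multiplier and a right shift. A standalone sketch of that gemmlowp-style decomposition, under the assumption that the multiplier lies in (0, 1):

#include <cmath>
#include <cstdint>

// Sketch: decompose m so that m ~= quantized_multiplier * 2^-31 * 2^-right_shift.
static void decompose_multiplier(float m, int32_t *quantized_multiplier, int *right_shift)
{
    int exponent = 0;
    const double q  = std::frexp(m, &exponent); // q in [0.5, 1), m = q * 2^exponent
    *right_shift    = -exponent;                // exponent <= 0 because m < 1
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // rounding pushed q up to exactly 1.0
    {
        q_fixed /= 2;
        --(*right_shift);
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
}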
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 9851475..f44f08b 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -42,14 +42,26 @@
{
}
-void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+namespace
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+ return Status{};
+}
+} // namespace
+
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
_input = input;
_output = output;
@@ -68,6 +80,7 @@
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+ build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
build_opts.add_option_if(has_bias, "-DHAS_BIAS");
build_opts.add_option_if_else(is_data_type_quantized_asymmetric(input->info()->data_type()),
"-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset),
@@ -77,20 +90,28 @@
// Configure the local work size for Bifrost with a value obtained
// via exhaustive autotuning for the MobileNets tensor shapes.
- const GPUTarget gpu_target = get_arch_from_target(get_target());
- if(gpu_target == GPUTarget::BIFROST)
+ const GPUTarget gpu_target = get_target();
+
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
_lws_hint = cl::NDRange(1, 2, 1);
}
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- // The CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ Window win = calculate_max_window(*output->info(), Steps());
+ // CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
}
+Status CLDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+
+ return Status{};
+}
+
void CLDepthwiseIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
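
gpu_target_is_in() replaces the old architecture-level BIFROST comparison with an explicit list of Bifrost products (G71, G72, G51, G51BIG, G51LIT, TNOX). A standalone sketch of such a variadic membership test follows; the enum below is a placeholder, the real values live in the GPUTarget header.

enum class GPUTarget { MIDGARD, BIFROST, T600, T700, T800, G71, G72, G51, G51BIG, G51LIT, TNOX };

// Base case: an empty candidate list never matches.
inline bool target_is_in(GPUTarget)
{
    return false;
}

// Recursive case: match the first candidate or keep scanning the rest.
template <typename... Rest>
inline bool target_is_in(GPUTarget target, GPUTarget first, Rest... rest)
{
    return (target == first) || target_is_in(target, rest...);
}

// Usage: target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72)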
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index 83fc168..26336eb 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -34,6 +34,34 @@
using namespace arm_compute;
+namespace
+{
+TensorShape compute_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h)
+{
+ TensorShape output_shape(input);
+ output_shape.set(0, conv_w);
+ output_shape.set(1, conv_h);
+ output_shape.set(2, input.x() / (conv_w * conv_h));
+
+ return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ TensorShape output_shape = compute_output_shape(input->tensor_shape(), conv_w, conv_h);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
CLDepthwiseVectorToTensorKernel::CLDepthwiseVectorToTensorKernel()
: _input(nullptr), _output(nullptr)
{
@@ -41,20 +69,13 @@
void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape = input->info()->tensor_shape();
- output_shape.set(0, conv_w);
- output_shape.set(1, conv_h);
- output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 // Output auto initialization if not yet initialized
+ TensorShape output_shape = compute_output_shape(input->info()->tensor_shape(), conv_w, conv_h);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), conv_w, conv_h));
_input = input;
_output = output;
@@ -75,6 +96,12 @@
ICLKernel::configure(win);
}
+Status CLDepthwiseVectorToTensorKernel::validate(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_w, conv_h));
+ return Status{};
+}
+
void CLDepthwiseVectorToTensorKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
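
compute_output_shape() above implies a simple linear index mapping when the flat GEMM output vector is folded back into a [conv_w, conv_h, C] tensor. A sketch of that mapping; the row-major ordering within each conv_w x conv_h plane is an assumption for illustration.

#include <cstddef>

// Sketch: element i of the flat vector lands at tensor coordinates (x, y, z).
static void vector_to_tensor_coords(std::size_t i, std::size_t conv_w, std::size_t conv_h,
                                    std::size_t &x, std::size_t &y, std::size_t &z)
{
    x = i % conv_w;
    y = (i / conv_w) % conv_h;
    z = i / (conv_w * conv_h);
}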
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
index 26da96f..b5a607d 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -34,6 +34,29 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(0) * input->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ return Status{};
+}
+} // namespace
+
CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
: _input(nullptr), _biases(nullptr), _output(nullptr)
{
@@ -41,20 +64,8 @@
void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), (biases != nullptr) ? biases->info() : nullptr));
_input = input;
_biases = biases;
@@ -80,6 +91,12 @@
ICLKernel::configure(win);
}
+Status CLDepthwiseWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, biases));
+ return Status{};
+}
+
void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
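
The dimension checks in validate_arguments() above pin down the reshaped-weights layout: one row per input channel, each row holding a flattened kernel_w x kernel_h kernel plus one trailing bias element when biases are present. A small sketch of the expected geometry:

#include <cstddef>

// Sketch: expected reshaped-weights dimensions for a depthwise convolution.
static std::size_t reshaped_row_length(std::size_t kernel_w, std::size_t kernel_h, bool has_bias)
{
    return kernel_w * kernel_h + (has_bias ? 1 : 0); // output dimension 0
}

static std::size_t reshaped_num_rows(std::size_t input_depth)
{
    return input_depth; // output dimension 1 equals input dimension 2
}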
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index 4efdb76..fa982d6 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+ // Update window and padding
+ bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
CLDequantizationLayerKernel::CLDequantizationLayerKernel()
: _input(nullptr), _output(nullptr), _min_max(nullptr)
{
@@ -41,37 +81,30 @@
void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
_input = input;
_output = output;
_min_max = min_max;
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer"));
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
- // Update window and padding
- update_window_and_padding(win, input_access, output_access, min_max_access);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- output_access.set_valid_region(win, input->info()->valid_region());
+ ICLKernel::configure(std::get<1>(win_config));
+}
- ICLKernel::configure(win);
+Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+ return Status{};
}
void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
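
For reference, the U8 -> F32 mapping this kernel computes from the two-element min_max tensor can be sketched as a linear rescale of the 8-bit range onto [min, max]; the exact rounding and the 255 denominator are assumptions inferred from the data-type checks above.

#include <cstdint>

// Sketch: map a quantized 8-bit value back onto the real interval [min, max].
static float dequantize_u8(std::uint8_t q, float min, float max)
{
    return min + (static_cast<float>(q) / 255.0f) * (max - min);
}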
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 86a3581..7c6c7de 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -34,6 +34,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"
@@ -41,26 +42,6 @@
namespace
{
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
- TensorShape output_shape = input_shape;
- output_shape.set(0, output_width);
- output_shape.set(1, output_height);
- output_shape.set(2, weights_shape[3]);
-
- return output_shape;
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
@@ -100,7 +81,7 @@
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
+ misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
@@ -114,7 +95,7 @@
const DataType data_type = input->data_type();
// Get convolved dimensions
- TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 // Output auto initialization if not yet initialized
auto_init_if_empty(*output, output_shape,
@@ -133,7 +114,8 @@
unsigned int num_elems_written_per_iteration_x = 0;
unsigned int num_elems_written_per_iteration_y = 0;
- if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
+ if(gpu_target_is_in(target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && (kernel_size <= 5) && (conv_stride_x == 1)
+ && (conv_stride_y == 1) && (data_type == DataType::F32))
{
// Configure kernel window
@@ -273,7 +255,7 @@
const DataType data_type = input->info()->data_type();
// Get convolved dimensions
- TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
 // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(),
@@ -299,7 +281,7 @@
_output = output;
_biases = biases;
- const GPUTarget gpu_target = get_arch_from_target(get_target());
+ const GPUTarget gpu_target = get_target();
std::stringstream kernel_name;
kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
@@ -307,85 +289,13 @@
CLBuildOptions build_options;
build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
- if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && (kernel_size <= 5) && (_conv_stride_x == 1)
+ && (_conv_stride_y == 1) && (data_type == DataType::F32))
{
build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
kernel_name << "_f32_bifrost";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
-
- // Through extensive experimentation with over 30 representative tensor
- // shapes, we found a small number of local work size configurations
- // that result in nearly optimal execution times. Selecting the right
- // lws for a given shape, however, required a complex decision tree,
- // until we constructed a simple feature as described below.
- //
- // We started from the number of multiply-accumulate operations for a
- // convolution layer, which is equal to the product of the input
- // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
- // this resulted in ties between distinct shapes that required distinct
- // lws configurations. Replacing the width of the input with the kernel
- // size, however, resulted in nearly optimal predictions. We use underscores
- // in variable names to indicate when they are intentionally misleading.
- const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
- const size_t product_of_input_dimensions_ = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
- const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
-
- switch(kernel_size)
- {
- case 1:
- {
- if(mega_ops_ < 1.f)
- {
- _lws_hint = cl::NDRange(1, 1, 8);
- }
- else if(mega_ops_ < 7.f)
- {
- _lws_hint = cl::NDRange(1, 1, 4);
- }
- else
- {
- _lws_hint = cl::NDRange(1, 1, 2);
- }
- break;
- }
- case 3:
- {
- if(mega_ops_ < 1.f)
- {
- _lws_hint = cl::NDRange(1, 1, 8);
- }
- else if(mega_ops_ < 13.f)
- {
- _lws_hint = cl::NDRange(2, 1, 4);
- }
- else if(mega_ops_ < 50.f)
- {
- _lws_hint = cl::NDRange(3, 1, 4);
- }
- else
- {
- _lws_hint = cl::NDRange(2, 1, 6);
- }
- break;
- }
- case 5:
- {
- if(mega_ops_ < 2.f || mega_ops_ > 80.f)
- {
- _lws_hint = cl::NDRange(2, 1, 4);
- }
- else
- {
- _lws_hint = cl::NDRange(2, 1, 8);
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
- }
- }
}
else
{
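
compute_deep_convolution_shape() subsumes the deleted get_output_shape() helper; both reduce to the usual convolved-dimension arithmetic performed by scaled_dimensions(). A standalone sketch of that arithmetic, assuming FLOOR rounding (the PadStrideInfo default; CEIL rounds the division up instead):

#include <cstddef>

// Sketch: output extent of a convolution along one axis, FLOOR rounding.
// Precondition: in + pad_before + pad_after >= kernel.
static std::size_t convolved_dim(std::size_t in, std::size_t kernel,
                                 std::size_t pad_before, std::size_t pad_after, std::size_t stride)
{
    return (in + pad_before + pad_after - kernel) / stride + 1;
}

// The full output shape is then [convolved_w, convolved_h, weights_shape[3]].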
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 241dd85..8f669a9 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -69,19 +69,23 @@
AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
window_changed = window_changed || update_window_and_padding(win, input_access);
- // Configure window in case of configured output
- if(output->total_size() != 0)
- {
- const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
- const float scale_y = 1.0f / (scale_x);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_interleaved_shape(*input, mult_interleave4x4_height)));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
+ // Configure window
+ const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
+ const float scale_y = 1.0f / (scale_x);
+
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, collapsed);
}
} // namespace
@@ -136,6 +140,10 @@
_config_id += support::cpp11::to_string(output->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(3));
}
Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
@@ -160,15 +168,14 @@
*
* After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
*/
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, _lws_hint);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
}
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ while(window.slide_window_slice_3D(slice));
}
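
As a reference for the data layout this kernel produces, here is a standalone sketch of the 4x4 interleaving for the mult_interleave4x4_height = 1 case. It assumes the number of rows is a multiple of 4; the real kernel handles the remainder through padding.

#include <cstddef>
#include <vector>

// Sketch: interleave groups of four rows of a row-major [h x w] matrix so the
// four values of each column within a group become adjacent. The result can be
// read as a [h / 4 x (4 * w)] matrix.
static std::vector<float> interleave_4x4(const std::vector<float> &in, std::size_t w, std::size_t h)
{
    std::vector<float> out(in.size());
    std::size_t o = 0;
    for(std::size_t row_block = 0; row_block < h; row_block += 4)
    {
        for(std::size_t x = 0; x < w; ++x)
        {
            for(std::size_t r = 0; r < 4; ++r)
            {
                out[o++] = in[(row_block + r) * w + x]; // column-wise pick across four rows
            }
        }
    }
    return out;
}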
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index ae498ec..3f705ac 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index d5c93dd..d409fdb 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@
unsigned int &num_elems_processed_per_iteration)
{
// Select the vector size to use (8 for Bifrost; 16 for Midgard).
- num_elems_processed_per_iteration = (gpu_target == GPUTarget::BIFROST) ? 8 : 16;
+ num_elems_processed_per_iteration = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) ? 8 : 16;
// Configure kernel window
Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
@@ -81,12 +81,12 @@
_biases = biases;
_accum = accum;
- // Get the target architecture
- GPUTarget arch_target = get_arch_from_target(get_target());
+ // Get the target gpu
+ GPUTarget gpu_target = get_target();
unsigned int vector_size = 0;
// Configure kernel window
- auto win_config = validate_and_configure_window(accum->info(), biases->info(), arch_target, vector_size);
+ auto win_config = validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 4b4814f..4538812 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -58,16 +58,15 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const float beta)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(input, output, beta);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_UNUSED(beta);
return Status{};
}
} // namespace
@@ -114,11 +113,10 @@
ICLKernel::configure(win_config.second);
}
-Status CLGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const float beta)
+Status CLGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, beta));
- ARM_COMPUTE_RETURN_ERROR_ON(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, beta));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
return Status{};
}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 6c31e37..cc9ae27 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -54,6 +55,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
if(!is_interleaved_transposed)
{
@@ -105,7 +107,7 @@
}
inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
- bool is_interleaved_transposed, GPUTarget gpu_target,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
ElementsProcessed &num_elements_processed)
{
bool window_changed = false;
@@ -115,6 +117,9 @@
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
+
if(is_interleaved_transposed)
{
// Configure kernel window
@@ -124,7 +129,9 @@
win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowStatic input1_access(input1, 0, 0,
+ ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+ ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
@@ -138,7 +145,8 @@
num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
// Create kernels according to the architecture, data type and input size.
- if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+ GPUTarget arch_target = get_arch_from_target(gpu_target);
+ if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
{
num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
}
@@ -157,13 +165,19 @@
output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
}
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, collapsed);
}
} // namespace
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
{
}
@@ -171,45 +185,64 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- // Output tensor auto inizialitation if not yet initialized
- TensorShape tensor_shape{ input0->info()->tensor_shape() };
- tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
- tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
-
- auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
-
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
- _input0 = input0;
- _input1 = input1;
- _output = output;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
const DataType data_type = input0->info()->data_type();
const int fp_pos = input0->info()->fixed_point_position();
// Get target architecture
- GPUTarget arch_target = get_arch_from_target(get_target());
+ GPUTarget gpu_target = get_target();
// Configure LWS hint
- if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
+ switch(gpu_target)
{
- // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
- _lws_hint = cl::NDRange(2, 2);
- }
- else if(output->info()->dimension(1) == 196)
- {
- _lws_hint = cl::NDRange(1, 7);
- }
- else
- {
- _lws_hint = cl::NDRange(8, 8);
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+ break;
+ case GPUTarget::G71:
+ case GPUTarget::G72:
+ case GPUTarget::G51:
+ case GPUTarget::G51BIG:
+ case GPUTarget::G51LIT:
+ case GPUTarget::TNOX:
+ if(input1->info()->dimension(1) == 24)
+ {
+ // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+ _lws_hint = cl::NDRange(2, 2);
+ }
+ else if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+ break;
+ default:
+ _lws_hint = cl::NullRange;
}
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
@@ -225,6 +258,11 @@
"-DALPHA=" + float_to_string_with_full_precision(alpha));
}
+ // Do not slide matrix B if _slide_matrix_b = false
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
+ const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+
std::string kernel_name;
if(is_interleaved_transposed)
{
@@ -235,9 +273,9 @@
build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
- if(data_type == DataType::F32)
+ if(is_data_type_float(data_type) && is_bifrost)
{
- kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
+ kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
}
else
{
@@ -247,14 +285,24 @@
else // The input tensors have not been reshaped
{
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
// Create kernels according to the architecture, data type and input size.
- if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
+ if(is_data_type_float(data_type) && is_bifrost)
{
- // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
- // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
- // FC6 and FC7 of AlexNet and VGG-16).
- kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+ kernel_name = "gemm_mm_floating_point";
+
+ if(input0->info()->num_dimensions() != 1)
+ {
+ kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+ }
+ else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
+ {
+ // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+ // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+ // FC6 and FC7 of AlexNet and VGG-16).
+ kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
+ }
// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
// via exhaustive autotuning over a range of representative layer configurations.
@@ -266,7 +314,6 @@
}
else // (MIDGARD and F32) or (F16)
{
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
kernel_name = "gemm_mm_floating_point";
}
build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
@@ -285,6 +332,10 @@
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(0));
_config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(3));
+ _config_id += "_";
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
@@ -299,6 +350,7 @@
input1->clone().get(),
output->clone().get(),
is_interleaved_transposed,
+ reshape_info,
gpu_target,
num_elements_processed)
.first);
@@ -311,7 +363,13 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
Window slice_matrix_b = slice;
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -321,8 +379,8 @@
{
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(_input1->info()->num_dimensions() < 3)
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -331,7 +389,10 @@
add_2D_tensor_argument(idx, _input0, slice);
add_2D_tensor_argument(idx, _input1, slice_b);
add_2D_tensor_argument(idx, _output, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
enqueue(queue, *this, slice, _lws_hint);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
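
The _slide_matrix_b flag and the -DMATRIX_B_DEPTH define implement a broadcast: when matrix B has fewer dimensions than A (e.g. GEMM used to implement a convolution), every batch of A multiplies the same B. A reference sketch of the addressing, row-major and illustrative only:

#include <cstddef>
#include <vector>

// Sketch: batched C = A * B with optional broadcast of a single B across all
// batches (the !_slide_matrix_b case). A holds batches of [m x k], B holds
// either batches of [k x n] or one [k x n], C holds batches of [m x n].
static void batched_gemm(const std::vector<float> &a, const std::vector<float> &b,
                         std::vector<float> &c, std::size_t m, std::size_t n, std::size_t k,
                         std::size_t batches, bool slide_matrix_b)
{
    for(std::size_t z = 0; z < batches; ++z)
    {
        const float *a_z = a.data() + z * m * k;
        const float *b_z = b.data() + (slide_matrix_b ? z * k * n : 0); // broadcast B when not sliding
        float       *c_z = c.data() + z * m * n;
        for(std::size_t y = 0; y < m; ++y)
        {
            for(std::size_t x = 0; x < n; ++x)
            {
                float acc = 0.f;
                for(std::size_t i = 0; i < k; ++i)
                {
                    acc += a_z[y * k + i] * b_z[i * n + x];
                }
                c_z[y * n + x] = acc;
            }
        }
    }
}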
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index cc483dc..b2ea95b 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -34,6 +34,42 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_read_per_iteration = 4;
+ constexpr unsigned int num_rows_read_per_iteration = 4;
+
+ const unsigned int border_x = ceil_to_multiple(input0->dimension(0), num_elems_read_per_iteration) - input0->dimension(0);
+ const unsigned int border_y = ceil_to_multiple(input0->dimension(1), num_rows_read_per_iteration) - input0->dimension(1);
+
+ Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration));
+
+ AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, output->dimension(0) + border_x, output->dimension(1) + border_y);
+
+ bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMMatrixVectorMultiplyKernel::CLGEMMMatrixVectorMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr), _num_rows_read_per_iteration(0), _border_size(0)
{
@@ -45,11 +81,8 @@
void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
_input0 = input0;
_input1 = input1;
@@ -77,8 +110,8 @@
// Configure the local work size for Bifrost with a value obtained
// via exhaustive autotuning for the MobileNets tensor shapes.
- const GPUTarget gpu_target = get_arch_from_target(get_target());
- if(gpu_target == GPUTarget::BIFROST)
+ const GPUTarget gpu_target = get_target();
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
_lws_hint = cl::NDRange(1, 1, 1);
}
@@ -93,17 +126,17 @@
_border_size = BorderSize(border_y, border_x);
- Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration, _num_rows_read_per_iteration);
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
- AccessWindowStatic output_access(_output->info(), 0, 0, _output->info()->dimension(0) + border_x, _output->info()->dimension(1) + border_y);
+Status CLGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMMatrixVectorMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
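
The hunks above are one instance of the validate/configure split applied throughout this release: pure metadata checks live in a file-local validate_arguments(), window and padding setup in validate_and_configure_window(), and the new static validate() composes both on cloned ITensorInfo objects so callers can test a configuration without mutating live tensors. A minimal self-contained sketch of the pattern (Status, Window, TensorInfo and MyKernel here are simplified stand-ins, not the real arm_compute types):

#include <memory>
#include <string>
#include <utility>

// Simplified stand-ins for the arm_compute types (illustrative only).
struct Status
{
    bool        ok{ true };
    std::string msg{};
};
struct Window
{
};
struct TensorInfo
{
    std::unique_ptr<TensorInfo> clone() const
    {
        return std::make_unique<TensorInfo>(*this);
    }
};

namespace
{
// Pure checks on tensor metadata: no side effects, safe to call twice.
Status validate_arguments(const TensorInfo *input, const TensorInfo *output)
{
    if(input == nullptr || output == nullptr)
    {
        return Status{ false, "Nullptr!" };
    }
    return Status{};
}

// Computes the execution window; in the real kernels this may also grow
// the padding of its arguments, which is why validate() passes clones.
std::pair<Status, Window> validate_and_configure_window(TensorInfo *input, TensorInfo *output)
{
    (void)input;
    (void)output;
    return std::make_pair(Status{}, Window{});
}
} // namespace

struct MyKernel
{
    static Status validate(const TensorInfo *input, const TensorInfo *output)
    {
        Status status = validate_arguments(input, output);
        if(!status.ok)
        {
            return status;
        }
        // Clones keep the caller's tensor metadata untouched.
        return validate_and_configure_window(input->clone().get(), output->clone().get()).first;
    }
};

int main()
{
    TensorInfo in, out;
    return MyKernel::validate(&in, &out).ok ? 0 : 1;
}

The same shape recurs below for CLL2NormalizeLayerKernel, CLLocallyConnectedMatrixMultiplyKernel and CLMinMaxLayerKernel, with std::tuple in place of std::pair.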
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 24d2187..05a20fd 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -70,24 +71,21 @@
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- if((win.x().end() / scale_x) == 0)
- {
- return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win);
- }
-
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width)));
// Configure window in case of configured output
- if(output->total_size() != 0)
- {
- AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
- }
+ AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), scale_x), output->dimension(1));
+ window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+
+ // Collapse along the Z direction
+ Window collapsed = win.collapse(win, Window::DimZ);
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, collapsed);
}
} // namespace
@@ -151,15 +149,15 @@
out_window.set(Window::DimX, window.y());
out_window.set(Window::DimY, window.x());
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, _lws_hint);
}
- while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+ while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
}
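
Both the matrix-vector border computation above and the AccessWindowStatic in this hunk lean on ceil_to_multiple, which rounds a dimension up to the next multiple of the processing step. A plain rendering of that arithmetic and of how the border falls out of it (the helper shown here is a sketch of the one in arm_compute/core/Utils.h, assumed to behave this way for positive integers):

#include <cassert>
#include <iostream>

// Round value up to the next multiple of divisor (positive integers).
unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    assert(divisor > 0);
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    // With 4 elements read per iteration, a width of 10 pads up to 12,
    // leaving a right border of 12 - 10 = 2 elements.
    const unsigned int width  = 10;
    const unsigned int step   = 4;
    const unsigned int border = ceil_to_multiple(width, step) - width;
    std::cout << "border_x = " << border << '\n'; // prints: border_x = 2
    return 0;
}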
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 34a228c..a4fda36 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,20 +32,19 @@
using namespace arm_compute;
CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
- : _border_size(0), _l2_load_offset(0)
+ : _l2_load_offset(0)
{
}
BorderSize CLGaussianPyramidHorKernel::border_size() const
{
- return _border_size;
+ return BorderSize(0, 2);
}
-void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
@@ -53,9 +52,8 @@
ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
}
- _input = input;
- _output = output;
- _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+ _input = input;
+ _output = output;
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x"));
@@ -64,9 +62,9 @@
constexpr unsigned int num_elems_processed_per_iteration = 16;
constexpr unsigned int num_elems_read_per_iteration = 20;
constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr float scale_x = 0.5f;
+ const float scale_x = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
- Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
// Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
@@ -95,11 +93,7 @@
AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
output_access);
- ValidRegion valid_region = input->info()->valid_region();
- valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
- valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
-
- output_access.set_valid_region(win, valid_region);
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
}
@@ -139,12 +133,11 @@
return BorderSize(2, 0);
}
-void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
{
@@ -163,10 +156,10 @@
constexpr unsigned int num_elems_written_per_iteration = 8;
constexpr unsigned int num_elems_read_per_iteration = 8;
constexpr unsigned int num_rows_per_iteration = 5;
- constexpr float scale_y = 0.5f;
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration),
- border_undefined, border_size());
+ const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
// Determine whether we need to load even or odd rows. See above for a
@@ -182,11 +175,7 @@
AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
output_access);
- ValidRegion valid_region = input->info()->valid_region();
- valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
- valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
-
- output_access.set_valid_region(win, valid_region);
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
}
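
The scale factors above are now derived from the actual output/input shape ratio rather than hard-coded to 0.5f, which also explains why the strict "input dimension equals twice the output dimension" assertions were dropped: a pyramid level's rounded size is generally not an exact half. A small worked example of the difference (the 99-to-50 rounding is a hypothetical level size, chosen for illustration):

#include <iostream>

int main()
{
    // A half-scale level of a 99-pixel-wide image may round to 50 pixels,
    // so the true ratio is close to, but not exactly, 0.5.
    const unsigned int in_w  = 99;
    const unsigned int out_w = 50;

    const float hard_coded = 0.5f;
    const float derived    = static_cast<float>(out_w) / in_w; // ~0.505

    std::cout << "hard-coded: " << hard_coded << ", derived: " << derived << '\n';
    return 0;
}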
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
index 87659c4..a15aab1 100644
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -172,7 +172,7 @@
AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
index 0f9a989..caca498 100644
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,10 +70,10 @@
args_str << "-DTHRESHOLD=" << threshold << " ";
args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
args_str << "-DIDX_CLASS=" << idx_class << " ";
- args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
- args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+ args_str << "-DDETECTION_WINDOW_STRIDE_WIDTH=" << detection_window_stride.width << " ";
+ args_str << "-DDETECTION_WINDOW_STRIDE_HEIGHT=" << detection_window_stride.height << " ";
// Construct kernel name
std::set<std::string> build_opts = {};
@@ -102,8 +102,8 @@
// Configure kernel window
Window win;
- win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
- win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
constexpr unsigned int num_elems_read_per_iteration = 1;
const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y;
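
The widened window above fixes an off-by-one: Window::Dimension treats its end as exclusive, so without the extra + window_step the final valid detection position was never visited. A worked example with assumed numbers (num_blocks_x = 20 and a 7-block-wide detection window are hypothetical):

#include <iostream>

// Round value down to a multiple of divisor (sketch of floor_to_multiple).
unsigned int floor_to_multiple(unsigned int value, unsigned int divisor)
{
    return (value / divisor) * divisor;
}

int main()
{
    const unsigned int num_blocks_x   = 20; // blocks across the image
    const unsigned int blocks_per_win = 7;  // blocks covered by one window
    const unsigned int step           = 1;

    const unsigned int last_valid = floor_to_multiple(num_blocks_x - blocks_per_win, step); // 13

    std::cout << "old exclusive end: " << last_valid << " (positions 0.." << last_valid - 1 << ")\n";
    std::cout << "new exclusive end: " << last_valid + step << " (positions 0.." << last_valid << ")\n";
    return 0;
}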
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index b75d264..d04c1dc 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -41,11 +41,12 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
// Checks performed when output is configured
if(output->total_size() != 0)
@@ -63,19 +64,19 @@
{
}
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation));
_input = input;
_output = output;
_kernel_dims = kernel_dims;
const DataType data_type = input->info()->data_type();
- const GPUTarget gpu_target = get_arch_from_target(get_target());
+ const GPUTarget gpu_target = get_target();
// Create kernel
CLBuildOptions build_opts;
@@ -107,7 +108,7 @@
_convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
kernel_dims.width, kernel_dims.height,
- conv_info);
+ conv_info, dilation);
build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
@@ -122,77 +123,82 @@
build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- if(squared_im2col && !is_data_type_fixed_point(data_type))
+ if(dilation == Size2D(1U, 1U))
{
- // Check if we can run an optimized im2col
- switch(kernel_dims.width)
+ if(squared_im2col && !is_data_type_fixed_point(data_type))
{
- case 1:
- // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding())
- {
- // Set hint for LWS
+ // Check if we can run an optimized im2col
+ switch(kernel_dims.width)
+ {
+ case 1:
+ // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+ if(conv_info.stride().first == 1 && !conv_info.has_padding())
+ {
+ // Set hint for LWS
+ _lws_hint = cl::NDRange(1, 1, 8);
+ _num_elems_processed_per_iteration = 4;
+ is_optimized_path = true;
+ kernel_name = "im2col1x1_stridex1_dchw";
+ }
+ break;
+ case 3:
_lws_hint = cl::NDRange(1, 1, 8);
- _num_elems_processed_per_iteration = 4;
- is_optimized_path = true;
- kernel_name = "im2col1x1_stridex1_dchw";
- }
- break;
- case 3:
- _lws_hint = cl::NDRange(1, 1, 8);
- _num_elems_processed_per_iteration = 1;
- is_optimized_path = true;
- kernel_name = "im2col3x3_dchw";
- break;
- case 5:
- _num_elems_processed_per_iteration = 1;
- is_optimized_path = true;
- kernel_name = "im2col5x5_dchw";
- break;
- case 11:
- // Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding())
- {
_num_elems_processed_per_iteration = 1;
is_optimized_path = true;
- kernel_name = "im2col11x11_padx0_pady0_dchw";
- }
- break;
- default:
- is_optimized_path = false;
- break;
+ kernel_name = "im2col3x3_dchw";
+ break;
+ case 5:
+ _num_elems_processed_per_iteration = 1;
+ is_optimized_path = true;
+ kernel_name = "im2col5x5_dchw";
+ break;
+ case 11:
+ // Optimized im2col11x11 if pad_x = pad_y = 0
+ if(!conv_info.has_padding())
+ {
+ _num_elems_processed_per_iteration = 1;
+ is_optimized_path = true;
+ kernel_name = "im2col11x11_padx0_pady0_dchw";
+ }
+ break;
+ default:
+ is_optimized_path = false;
+ break;
+ }
}
- }
- else if(kernel_dims.width > 1 && !conv_info.has_padding())
- {
- _num_elems_processed_per_iteration = 1;
- kernel_name = "im2col_generic_padx0_pady0_dchw";
+ else if(kernel_dims.width > 1 && !conv_info.has_padding())
+ {
+ _num_elems_processed_per_iteration = 1;
+ kernel_name = "im2col_generic_padx0_pady0_dchw";
- // Optimized im2col is performed using one or more vector operations with the specified vector size
- // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
- // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
- // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
- // Using the vector size of 8, however, may be faster.
- size_t vector_size = 4;
- // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
- // is used instead.)
- if(kernel_dims.width < vector_size)
- {
- vector_size = kernel_dims.width;
+ // Optimized im2col is performed using one or more vector operations with the specified vector size
+ // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+ // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+ // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+ // Using the vector size of 8, however, may be faster.
+ size_t vector_size = 4;
+ // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+ // is used instead.)
+ if(kernel_dims.width < vector_size)
+ {
+ vector_size = kernel_dims.width;
+ }
+ // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && kernel_dims.width == 11)
+ {
+ _lws_hint = cl::NDRange(1, 1, 1);
+ vector_size = 8;
+ }
+ const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+ build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
}
- // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
- if(gpu_target == GPUTarget::BIFROST && kernel_dims.width == 11)
- {
- _lws_hint = cl::NDRange(1, 1, 1);
- vector_size = 8;
- }
- const size_t width_mod_vector_size = kernel_dims.width % vector_size;
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
}
_run_func = &CLIm2ColKernel::run_generic;
}
@@ -206,7 +212,7 @@
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
- // Configure kernel window
+ // Configure kernel window
Window win;
if(is_optimized_path)
{
@@ -250,12 +256,12 @@
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_UNUSED(kernel_dims);
ARM_COMPUTE_UNUSED(conv_info);
ARM_COMPUTE_UNUSED(has_bias);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation));
return Status{};
}
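
Dilation enters the im2col computation through the effective kernel size, (k - 1) * d + 1, which is what scaled_dimensions uses to derive _convolved_dims above. A sketch of the per-axis output formula under that convention (floor division; the helper name here is illustrative):

#include <iostream>

// Output extent of a dilated convolution along one axis.
unsigned int conv_out_dim(unsigned int in, unsigned int kernel,
                          unsigned int stride, unsigned int pad, unsigned int dilation)
{
    const unsigned int effective_kernel = (kernel - 1) * dilation + 1;
    return (in + 2 * pad - effective_kernel) / stride + 1;
}

int main()
{
    // A 3x3 kernel with dilation 2 samples the same span as a 5x5 kernel:
    std::cout << conv_out_dim(32, 3, 1, 0, 1) << '\n'; // 30
    std::cout << conv_out_dim(32, 3, 1, 0, 2) << '\n'; // 28, matching a dense 5x5
    return 0;
}

This is also why validate_arguments() now rejects dilations below 1: a zero dilation would collapse the effective kernel to a single row and column regardless of its nominal size.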
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 36e351e..3d30350 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,18 +42,60 @@
{
}
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_UNUSED(epsilon);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+
+ // Reduce shape on axis
+ TensorShape sum_shape = input->tensor_shape();
+ sum_shape.set(axis, 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- // Sum and output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
- ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
_input = input;
_sum = sum;
@@ -76,15 +118,18 @@
_kernel.setArg<cl_uint>(idx, _epsilon);
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ ICLKernel::configure(std::get<1>(win_config));
+}
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
- ICLKernel::configure(win);
+ return Status{};
}
void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
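
For reference, the kernel computes something like x / sqrt(max(sum(x^2), epsilon)) along axis 0, with the reduction supplied by the sum tensor whose reduced shape validate_arguments() now checks. A scalar sketch of that math (the epsilon-as-lower-bound placement is an assumption about the kernel's exact formulation):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Scalar sketch of L2 normalization along the innermost axis.
std::vector<float> l2_normalize(const std::vector<float> &row, float epsilon)
{
    float sum_sq = 0.f;
    for(float v : row)
    {
        sum_sq += v * v;
    }
    // Epsilon guards against division by zero on all-zero rows (assumed).
    const float norm = std::sqrt(std::max(sum_sq, epsilon));

    std::vector<float> out(row.size());
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = row[i] / norm;
    }
    return out;
}

int main()
{
    for(float v : l2_normalize({ 3.f, 4.f }, 1e-12f))
    {
        std::printf("%g ", v); // prints: 0.6 0.8
    }
    std::printf("\n");
    return 0;
}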
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
index 12cdd0e..078d18e 100644
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -249,8 +249,12 @@
static_cast<cl_float>(valid_region.start(0))
}
};
- const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
- const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+
+ // Set maximum number of iterations used for convergence
+ const size_t max_iterations = 1000;
+ num_iterations = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
+
+ const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1"));
@@ -268,7 +272,6 @@
_kernel.setArg<cl_float3>(idx++, border_limits);
_kernel.setArg<cl_float>(idx++, eig_const);
_kernel.setArg<cl_int>(idx++, level0);
- _kernel.setArg<cl_int>(idx++, term_iteration);
_kernel.setArg<cl_int>(idx++, term_epsilon);
}
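
The tracker change above moves iteration control to the host: when only the epsilon criterion is requested, num_iterations is clamped to a fixed cap of 1000, so the device loop needs just an iteration bound plus an optional epsilon test, and the separate term_iteration flag disappears. A scalar sketch of the resulting control flow (the refine() body is a placeholder, not the real Lucas-Kanade update):

#include <cmath>
#include <cstddef>
#include <cstdio>

enum class Termination { ITERATIONS, EPSILON, BOTH };

float refine(Termination termination, std::size_t num_iterations, float epsilon)
{
    // Epsilon-only termination still needs a hard bound so the kernel is
    // guaranteed to finish; the host substitutes a large fixed cap.
    const std::size_t max_iterations = 1000;
    if(termination == Termination::EPSILON)
    {
        num_iterations = max_iterations;
    }
    const bool use_epsilon = (termination == Termination::EPSILON || termination == Termination::BOTH);

    float delta = 1.0f;
    for(std::size_t i = 0; i < num_iterations; ++i)
    {
        delta *= 0.5f; // placeholder for the real motion-vector update
        if(use_epsilon && std::fabs(delta) < epsilon)
        {
            break;
        }
    }
    return delta;
}

int main()
{
    std::printf("%g\n", refine(Termination::EPSILON, 5, 1e-3f));
    return 0;
}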
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index a3af5b0..84f2e0c 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,13 +46,44 @@
{
}
+namespace
+{
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->data_type());
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal input0_access(input0, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+ bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
_input0 = input0;
_input1 = input1;
@@ -77,20 +108,20 @@
std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts));
- // Configure window kernel
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ ICLKernel::configure(std::get<1>(win_config));
+}
- update_window_and_padding(win, input0_access, input1_access, output_access);
+Status CLLocallyConnectedMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get())));
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index 8ba1f77..60dd5e7 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -30,10 +30,55 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <climits>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ TensorShape output_shape = compute_min_max_shape(input);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ TensorShape output_shape = compute_min_max_shape(input);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, 2, output->dimension(1));
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
CLMinMaxLayerKernel::CLMinMaxLayerKernel()
: _input(nullptr), _output(nullptr)
@@ -42,26 +87,12 @@
void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(Window::DimX, 2);
- output_shape.remove_dimension(1);
- output_shape.remove_dimension(1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = 1;
-
std::set<std::string> build_opts;
build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -70,16 +101,19 @@
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax_layer", build_opts));
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output->info(), 0, 0, 2, output->info()->dimension(1));
+ auto win_config = validate_and_configure_window(input->info(), output->info());
- update_window_and_padding(win, input_access, output_access);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(std::get<1>(win_config));
+}
- ICLKernel::configure(win);
+Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+
+ return Status{};
}
void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
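
compute_min_max_shape packages the inline shape surgery that the removed configure() code did by hand: the X extent becomes 2 (one min and one max) and the next two dimensions collapse away. A sketch of the equivalent computation on a plain dimension vector (ordering {W, H, C, N}, fastest to slowest, as the removed code assumed):

#include <cstddef>
#include <iostream>
#include <vector>

// Mirrors: output_shape.set(Window::DimX, 2); remove_dimension(1) twice.
std::vector<std::size_t> min_max_shape(std::vector<std::size_t> shape)
{
    shape[0] = 2;                   // two values per map: min and max
    shape.erase(shape.begin() + 1); // drop H
    shape.erase(shape.begin() + 1); // drop C
    return shape;
}

int main()
{
    // A {32, 32, 16, 4} input (W, H, C, N) yields a {2, 4} output:
    for(std::size_t d : min_max_shape({ 32, 32, 16, 4 }))
    {
        std::cout << d << ' ';
    }
    std::cout << '\n'; // prints: 2 4
    return 0;
}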
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index da34448..d20bee1 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -114,13 +114,18 @@
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
+    // CLPermute doesn't need padding, so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
ICLKernel::configure(win);
}
Status CLPermuteKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, perm));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
return Status{};
}
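
The one-line CLPermuteKernel::validate() fix is easy to read past: ARM_COMPUTE_RETURN_ERROR_ON takes a boolean error condition, while ARM_COMPUTE_RETURN_ON_ERROR propagates an already-built Status, so passing validate_arguments()'s Status to the former misreports the result. A sketch of the distinction with simplified macros (these renderings are illustrative, not the definitions in arm_compute/core/Error.h):

#include <iostream>
#include <string>

struct Status
{
    bool        ok{ true };
    std::string msg{};
};

// Returns an error when the boolean condition holds.
#define RETURN_ERROR_ON(cond)                       \
    if(cond)                                        \
    {                                               \
        return Status{ false, "condition " #cond }; \
    }

// Forwards an error Status produced by a callee.
#define RETURN_ON_ERROR(status) \
    {                           \
        Status s_ = (status);   \
        if(!s_.ok)              \
        {                       \
            return s_;          \
        }                       \
    }

Status validate_arguments()
{
    return Status{ false, "shapes mismatch" }; // pretend validation failed
}

Status validate(const int *ptr)
{
    RETURN_ERROR_ON(ptr == nullptr);       // boolean condition: correct use
    RETURN_ON_ERROR(validate_arguments()); // Status propagation: correct use
    return Status{};
}

int main()
{
    int x = 0;
    std::cout << validate(&x).msg << '\n'; // prints: shapes mismatch
    return 0;
}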
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index b3034e1..02fa283 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -34,53 +34,52 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <set>
#include <string>
#include <tuple>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
// Internal window config info
using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
-void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
+void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info)
{
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, pooled_w);
- output_shape.set(1, pooled_h);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+ TensorShape out_shape = compute_pool_shape(*input, pool_info);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ DataLayout data_layout = input->data_layout();
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ break;
+ case DataLayout::NHWC:
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
"Unsupported combination of parameters!");
- const bool is_global_pooling = pool_info.is_global_pooling();
- const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
- const unsigned int pool_size_y = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
-
// Checks performed when output is configured
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
- input->dimension(1),
- pool_size_x,
- pool_size_y,
- pool_info.pad_stride_info());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h),
- "Invalid output pooling dimensions!");
+ TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type(), output->fixed_point_position()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
return Status{};
@@ -88,59 +87,82 @@
std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Get data layout
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
int pool_stride_x = 0;
int pool_stride_y = 0;
unsigned int pooled_w = 0;
unsigned int pooled_h = 0;
- int pool_size_x = pool_info.is_global_pooling() ? input->dimension(0) : pool_info.pool_size().width;
- int pool_size_y = pool_info.is_global_pooling() ? input->dimension(1) : pool_info.pool_size().height;
+ int pool_size_x = pool_info.is_global_pooling() ? input->dimension(idx_width) : pool_info.pool_size().width;
+ int pool_size_y = pool_info.is_global_pooling() ? input->dimension(idx_height) : pool_info.pool_size().height;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_right = pad_stride_info.pad_right();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const int pool_pad_bottom = pad_stride_info.pad_bottom();
+ const int pool_pad_right = pad_stride_info.pad_right();
+ const int pool_pad_top = pad_stride_info.pad_top();
+ const int pool_pad_left = pad_stride_info.pad_left();
+ const int pool_pad_bottom = pad_stride_info.pad_bottom();
+ BorderSize border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ auto_init(input, output, pool_info);
+ pooled_w = output->tensor_shape()[idx_width];
+ pooled_h = output->tensor_shape()[idx_height];
- // Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
- input->dimension(1),
- pool_size_x,
- pool_size_y,
- pad_stride_info);
+ const DataType data_type = input->data_type();
- auto_init(input, output, pooled_w, pooled_h);
+ const int input_width = input->dimension(idx_width);
+ const int input_height = input->dimension(idx_height);
- BorderSize border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
- const DataType data_type = input->data_type();
+ unsigned int num_elems_processed_per_iteration = 0;
+ bool window_changed = false;
+ Window win{};
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ // Change the number of elements processed per iteration
+            // for 3x3 pooling with a stride less than or equal to 3
+ const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
+ num_elems_processed_per_iteration = can_optimize ? 4 : 1;
+ const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
- const int input_width = input->dimension(0);
- const int input_height = input->dimension(1);
+ // Number of iterations in X dimension
+ const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
- // Change the number of elements processed per iteration
- // for pooling 3x3 with stride less equal than 3
- const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
- const unsigned int num_elems_processed_per_iteration = can_optimize ? 4 : 1;
- const int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
+ // Upper limit for the number of right/bottom border elements that are accessed
+ const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
- // Number of iterations in X dimension
- const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
+ border_size.right = std::max(upper_bound_w, pool_pad_right);
+ border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
- // Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- border_size.right = std::max(upper_bound_w, pool_pad_right);
- border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
+ AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
+ pool_stride_x, pool_stride_y);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ num_elems_processed_per_iteration = 8;
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
- pool_stride_x * num_elems_processed_per_iteration, pool_stride_y);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ AccessWindowRectangle input_access(input, 0, -pool_pad_left, num_elems_processed_per_iteration, pool_size_x);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_tuple(err, win, CLPoolingConfig(num_elems_processed_per_iteration, border_size));
@@ -159,30 +181,25 @@
void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
int pool_stride_x = 0;
int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
const PoolingType pool_type = pool_info.pool_type();
- const int pool_size_x = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_info.pool_size().width;
- const int pool_size_y = pool_info.is_global_pooling() ? input->info()->dimension(1) : pool_info.pool_size().height;
+ DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int pool_size_x = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
+ const int pool_size_y = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
const bool exclude_padding = pool_info.exclude_padding();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
const int pool_pad_top = pad_stride_info.pad_top();
const int pool_pad_left = pad_stride_info.pad_left();
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
// Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
- input->info()->dimension(1),
- pool_size_x,
- pool_size_y,
- pad_stride_info);
-
- auto_init(input->info(), output->info(), pooled_w, pooled_h);
-
+ auto_init(input->info(), output->info(), pool_info);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
// Set instance variables
@@ -190,7 +207,7 @@
_output = output;
_pool_info = pool_info;
- const GPUTarget gpu_target = get_arch_from_target(get_target());
+ const GPUTarget gpu_target = get_target();
const DataType data_type = input->info()->data_type();
// Set build options
@@ -200,65 +217,93 @@
build_opts.add_option_if(is_data_type_fixed_point(data_type),
"-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
- if(pool_type != PoolingType::MAX)
- {
- build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_left)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_top)));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
- build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
- build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
- }
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+ build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+ build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
+ build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+ build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+ build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
// Create kernel
- if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
+ switch(data_layout)
{
- // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
- // each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+ case DataLayout::NCHW:
+ {
+ build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+ build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+ if(pool_type != PoolingType::MAX)
+ {
+ build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+ }
- std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
- + support::cpp11::to_string(pool_size_x);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
- }
- else // Run general case
- {
- build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
- build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
- build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
+ if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
+ {
+                // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where
+                // each thread computes 4 output elements
+ const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized" : "pooling_layer_MxN";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+ + support::cpp11::to_string(pool_size_x);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ }
+ else // Run general case
+ {
+ std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ }
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+ build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+ build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+ std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
}
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ ICLKernel::configure(std::get<1>(win_config));
// Configure the local work size (hint) from the first two dimensions of the global work size.
// On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
// kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
// invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
- if(gpu_target == GPUTarget::BIFROST)
+ if(data_layout == DataLayout::NCHW)
{
- cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config));
- _lws_hint = cl::NDRange(gws[0], gws[1], 1);
+ CLPoolingConfig pooling_config = std::get<2>(win_config);
+ _num_elems_processed_per_iteration = pooling_config.first;
+ _border_size = pooling_config.second;
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
+ {
+ cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config));
+ _lws_hint = cl::NDRange(gws[0], gws[1], 1);
+ }
}
-
- ICLKernel::configure(std::get<1>(win_config));
-
- CLPoolingConfig pooling_config = std::get<2>(win_config);
- _num_elems_processed_per_iteration = pooling_config.first;
- _border_size = pooling_config.second;
+ else
+ {
+ _border_size = BorderSize(1, 0, 0, 0);
+ _num_elems_processed_per_iteration = 8;
+ }
// Set config_id for enabling LWS tuning
_config_id = "pooling_layer_";
_config_id += lower_string(string_from_data_type(data_type));
_config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += lower_string(string_from_data_layout(data_layout));
_config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(idx_height));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(idx_channel));
}
Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -278,25 +323,52 @@
unsigned int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
+ switch(_input->info()->data_layout())
{
- // Upsample input by pool size
- Window in_slice(slice);
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info().pad_left(),
- (in_slice.x().end() - _pool_info.pad_stride_info().pad_left()) * pool_stride_x,
- pool_stride_x * _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info().pad_top(),
- (in_slice.y().end() - _pool_info.pad_stride_info().pad_top()) * pool_stride_y,
- pool_stride_y));
+ case DataLayout::NCHW:
+ {
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
+ do
+ {
+ // Upsample input by pool size
+ Window in_slice(slice);
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info().pad_left(),
+ (in_slice.x().end() - _pool_info.pad_stride_info().pad_left()) * pool_stride_x,
+ pool_stride_x * _num_elems_processed_per_iteration));
+ in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info().pad_top(),
+ (in_slice.y().end() - _pool_info.pad_stride_info().pad_top()) * pool_stride_y,
+ pool_stride_y));
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, _lws_hint);
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window_collapsed.slide_window_slice_3D(slice));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ Window slice = window.first_slice_window_3D();
+
+ Window in_slice = window.first_slice_window_3D();
+ in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
+ in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
+ in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+ do
+ {
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(in_slice));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
}
- while(window_collapsed.slide_window_slice_3D(slice));
}
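
A minimal usage sketch of the NHWC path added above, assuming the v18.05 public runtime API (CLScheduler, CLTensor, CLPoolingLayer); the shapes are illustrative, and NHWC tensors are assumed to store their shape as (C, W, H):

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // NHWC tensor shapes are stored as (C, W, H): 16 channels over an 8x8 plane.
    CLTensor   src, dst;
    TensorInfo src_info(TensorShape(16U, 8U, 8U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    src.allocator()->init(src_info);

    // 3x3 max pooling, stride 1, no padding; dst info is auto-initialised by configure().
    CLPoolingLayer pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 0, 0)));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    pool.run();
    CLScheduler::get().sync();
    return 0;
}
```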
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 8b082a8..028e508 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+ // Update window and padding
+ bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
CLQuantizationLayerKernel::CLQuantizationLayerKernel()
: _input(nullptr), _output(nullptr), _min_max(nullptr)
{
@@ -41,37 +81,30 @@
void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
_input = input;
_output = output;
_min_max = min_max;
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
- // Update window and padding
- update_window_and_padding(win, input_access, output_access, min_max_access);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- output_access.set_valid_region(win, input->info()->valid_region());
+ ICLKernel::configure(std::get<1>(win_config));
+}
- ICLKernel::configure(win);
+Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+ return Status{};
}
void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
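
The new static validate() allows a configuration to be checked without touching the OpenCL runtime. A short sketch, under the assumption that the shapes below satisfy the F32-in/U8-out contract enforced by validate_arguments():

```cpp
#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // Input must be at least 3D F32; output must match its shape with type U8.
    const TensorInfo input(TensorShape(16U, 16U, 3U), 1, DataType::F32);
    const TensorInfo output(TensorShape(16U, 16U, 3U), 1, DataType::U8);
    const TensorInfo min_max(TensorShape(2U), 1, DataType::F32);

    const Status status = CLQuantizationLayerKernel::validate(&input, &output, &min_max);
    if(!bool(status))
    {
        std::cerr << status.error_description() << std::endl;
        return 1;
    }
    return 0;
}
```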
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 1dd5eb9..25b756b 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -38,6 +38,52 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis. Supported axis is 0");
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+{
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{ input->tensor_shape() };
+ output_shape.set(axis, 1);
+ auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ const unsigned int border_width = ((input->dimension(0) % 128) != 0) ? 128 - input->dimension(0) % 128 : 0;
+
+ AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1);
+ AccessWindowHorizontal output_access(output, 0, 1);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
CLReductionOperationKernel::CLReductionOperationKernel()
: _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
{
@@ -50,17 +96,9 @@
void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output tensor auto initialization if not yet initialized
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(axis, 1);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
- ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
const unsigned int num_elems_processed_per_iteration = 16;
const unsigned int border_width = ((input->info()->dimension(0) % 128) != 0) ? 128 - input->info()->dimension(0) % 128 : 0;
@@ -97,15 +135,19 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
- AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + border_width, 1);
- AccessWindowHorizontal output_access(output->info(), 0, 1);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, output->info()->valid_region());
+ ICLKernel::configure(std::get<1>(win_config));
+}
- ICLKernel::configure(win);
+Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+
+ return Status{};
}
void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &queue)
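
The 128-element alignment used in validate_and_configure_window above can be sanity-checked in isolation; a compile-time sketch of the same border_width expression:

```cpp
// Rows are right-padded up to the next multiple of 128 elements.
constexpr unsigned int width  = 200;
constexpr unsigned int border = (width % 128 != 0) ? 128 - width % 128 : 0;
static_assert(border == 56 && width + border == 256, "row padded to a multiple of 128");
```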
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 673304a..9b8a582 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -102,7 +102,7 @@
output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
output->info()->tensor_shape(),
policy,
- border,
+ sampling_policy,
border_undefined));
ICLKernel::configure(win);
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 3b5fbc9..b80a612 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..b8bce38
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
+using namespace arm_compute;
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int width_offset, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ // The window needs to be based on the input, as we copy the input's full width
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::F16, DataType::U32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0));
+
+ for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
+
+ return Status{};
+}
+} // namespace
+
+CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel()
+ : _input(nullptr), _output(nullptr), _width_offset(0)
+{
+}
+
+Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, width_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), width_offset, output->clone().get()).first);
+ return Status{};
+}
+
+void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), width_offset, output->info()));
+
+ _input = input;
+ _output = output;
+ _width_offset = width_offset;
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
+
+ const int offset_to_first_elements_in_bytes = _width_offset * _output->info()->strides_in_bytes()[0];
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), width_offset, output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure(std::get<1>(win_config));
+}
+
+void CLWidthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
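
A hedged usage sketch for the new kernel: one configure() per input, each given the running width offset into the output. Tensor shapes and the direct kernel enqueue are illustrative only; the runtime concatenation function is expected to drive this the same way.

```cpp
#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(14U, 4U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(24U, 4U), 1, DataType::F32)); // 24 = 10 + 14

    // One kernel per input; each writes at its running width offset.
    CLWidthConcatenateLayerKernel k0, k1;
    k0.configure(&in0, 0U, &out);  // writes x in [0, 10)
    k1.configure(&in1, 10U, &out); // writes x in [10, 24)

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();

    CLScheduler::get().enqueue(k0, false);
    CLScheduler::get().enqueue(k1, true);
    CLScheduler::get().sync();
    return 0;
}
```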
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
new file mode 100644
index 0000000..41b3ac5
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+ && output_tile_size != Size2D(4U, 4U),
+ "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ const unsigned int num_elems_processed_per_iteration_x = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH));
+ const unsigned int num_elems_processed_per_iteration_y = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT));
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ bool window_changed = false;
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+}
+} // namespace
+
+CLWinogradFilterTransformKernel::CLWinogradFilterTransformKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), winograd_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
+
+ const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(idx_c)));
+
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+
+ // Create kernel
+ std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_nchw";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
+
+Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLWinogradFilterTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Setup output window
+ Window window_out;
+ window_out.use_tensor_dimensions(_output->info()->tensor_shape(), 0);
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_3D_tensor_argument(idx, _output, window_out);
+ enqueue(queue, *this, window);
+}
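
The accepted kernel/tile combinations above pin down the transform size: a kernel of size k with an output tile of size m produces an (m + k - 1)^2 element transform. A compile-time sketch of that arithmetic, which also explains the 16/36/64 batch checks in the output transform kernel further down:

```cpp
constexpr unsigned int transform_size(unsigned int m, unsigned int k)
{
    return (m + k - 1) * (m + k - 1);
}
static_assert(transform_size(2, 3) == 16, "F(2x2, 3x3)");
static_assert(transform_size(4, 3) == 36, "F(4x4, 3x3)");
static_assert(transform_size(4, 5) == 64, "F(4x4, 5x5)");
```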
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
new file mode 100644
index 0000000..febd22b
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+ && output_tile_size != Size2D(4U, 4U),
+ "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels");
+ ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_UNUSED(output_tile_size);
+ ARM_COMPUTE_UNUSED(kernel_size);
+
+ // Validate configured output
+ if(output->total_size() != 0)
+ {
+ const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+
+ const unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
+ const unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
+
+ Window win = calculate_max_window(*input, Steps(1, 1));
+
+ AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+
+ bool window_changed = update_window_and_padding(win, input_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
+ : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
+{
+}
+
+BorderSize CLWinogradInputTransformKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
+
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+
+ // Compute number of elements to process in the X and Y direction
+ const int num_elements_x = input->info()->dimension(0) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+ const int num_elements_y = input->info()->dimension(1) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
+
+ // Check if we need to extend the right or bottom border
+ const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
+ const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right() + extra_border_right, conv_info.pad_bottom() + extra_border_bottom, conv_info.pad_left());
+ _num_tiles_x = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+ _num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+
+ const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input->info(), winograd_info);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+
+ // Create kernel
+ std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
+
+ // Select the Z step for the optimized 2x2 output tile kernel: step 2 if the input depth is even, 1 otherwise
+ if(output_tile_size == Size2D(2U, 2U))
+ {
+ _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
+ }
+
+ _lws_hint = cl::NDRange(1, 1, 8);
+
+ // Append stepz and data layout
+ kernel_name += "_stepz";
+ kernel_name += support::cpp11::to_string(_step_z);
+ kernel_name += "_nchw";
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Create window and update padding
+ auto win_config = validate_and_configure_window(input->info(), output->info(), winograd_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ _config_id = kernel_name;
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_info.pad_left());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_info.pad_top());
+}
+
+Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), winograd_info).first);
+
+ return Status{};
+}
+
+void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ slice.set(Window::DimX, Window::Dimension(0, _num_tiles_x, 1));
+ slice.set(Window::DimY, Window::Dimension(0, _num_tiles_y, 1));
+
+ ARM_COMPUTE_ERROR_ON(((slice.z().end() - slice.z().start()) % _step_z) != 0);
+ slice.set(Window::DimZ, Window::Dimension(slice.z().start(), slice.z().end(), _step_z));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
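
A worked sketch of the num_tiles computation in configure(), assuming a 224x224 input, 3x3 kernel, 4x4 output tile and padding 1 on each side:

```cpp
constexpr int W              = 224, kW = 3, pad_l = 1, pad_r = 1, tile_w = 4;
constexpr int num_elements_x = W - (kW - 1) + pad_l + pad_r;        // 224
constexpr int num_tiles_x    = (num_elements_x + tile_w - 1) / tile_w; // ceil -> 56
static_assert(num_tiles_x == 56, "56 tiles of width 4 cover the padded row");
```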
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
new file mode 100644
index 0000000..5c0a735
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(winograd_info.output_data_layout != DataLayout::NCHW);
+
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D input_dimensions = winograd_info.input_dimensions;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches");
+
+ // Compute number of elements to process in the X and Y direction
+ const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+ const int num_elements_y = input_dimensions.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
+ const int num_tiles_x = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+ const int num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y)));
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+ }
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), output_tile_size.width), ceil_to_multiple(output->dimension(1), output_tile_size.height));
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, bias_access, output_access);
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ }
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
+ : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info));
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ // Compute num_tiles_x
+ const Size2D input_dimensions = winograd_info.input_dimensions;
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+ const int num_tiles_x = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles_x));
+
+ // Create kernel
+ std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_nchw";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info.output_tile_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
+
+ return Status{};
+}
+
+void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Setup output slice
+ Window slice_out(slice);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ if(_bias != nullptr)
+ {
+ unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
+ add_1D_tensor_argument(idx1, _bias, slice_biases);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+}
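
A brief note on the argument indexing in run() above: the two add_3D_tensor_argument() calls consume the first 2 * num_arguments_per_3D_tensor() kernel argument slots, so the optional bias is bound once, up front, at the index immediately after them, while the input/output arguments are rebound per slice.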
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
new file mode 100644
index 0000000..9c2b41b
--- /dev/null
+++ b/src/core/CPP/CPPTypes.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+#include "arm_compute/core/Error.h"
+
+#ifndef BARE_METAL
+#include <sched.h>
+#endif /* BARE_METAL */
+
+using namespace arm_compute;
+
+void CPUInfo::set_fp16(const bool fp16)
+{
+ _fp16 = fp16;
+}
+
+void CPUInfo::set_dotprod(const bool dotprod)
+{
+ _dotprod = dotprod;
+}
+
+void CPUInfo::set_cpu_model(unsigned int cpuid, CPUModel model)
+{
+ ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size());
+ if(_percpu.size() > cpuid)
+ {
+ _percpu[cpuid] = model;
+ }
+}
+
+bool CPUInfo::has_fp16() const
+{
+ return _fp16;
+}
+
+bool CPUInfo::has_dotprod() const
+{
+ return _dotprod;
+}
+
+CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
+{
+ if(cpuid < _percpu.size())
+ {
+ return _percpu[cpuid];
+ }
+ return CPUModel::GENERIC;
+}
+
+unsigned int CPUInfo::get_L1_cache_size() const
+{
+ return _L1_cache_size;
+}
+
+void CPUInfo::set_L1_cache_size(unsigned int size)
+{
+ _L1_cache_size = size;
+}
+
+unsigned int CPUInfo::get_L2_cache_size() const
+{
+ return _L2_cache_size;
+}
+
+void CPUInfo::set_L2_cache_size(unsigned int size)
+{
+ _L2_cache_size = size;
+}
+
+void CPUInfo::set_cpu_num(unsigned int cpu_count)
+{
+ _percpu.resize(cpu_count);
+}
+
+CPUInfo::CPUInfo()
+ : _percpu(1)
+{
+ // The core library knows nothing about the CPUs, so we default to a single generic CPU.
+ // The runtime NEScheduler will initialise this vector with the correct CPU models.
+ // See void detect_cpus_configuration(CPUInfo &cpuinfo) in CPPUtils.h
+ _percpu[0] = CPUModel::GENERIC;
+}
+
+CPUModel CPUInfo::get_cpu_model() const
+{
+#if defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__))
+ return get_cpu_model(0);
+#else /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+ return get_cpu_model(sched_getcpu());
+#endif /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+}
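
A short sketch of how the new CPUInfo accessors compose; CPUModel::A53 is assumed to be among the enumerators, and the out-of-range fallback mirrors get_cpu_model(cpuid) above:

```cpp
#include "arm_compute/core/CPP/CPPTypes.h"

using namespace arm_compute;

int main()
{
    CPUInfo info;
    info.set_cpu_num(8);                  // resize the per-core model table
    info.set_cpu_model(0, CPUModel::A53); // e.g. a LITTLE core (assumed enumerator)
    info.set_L1_cache_size(32 * 1024);

    // Out-of-range queries fall back to GENERIC rather than failing.
    return info.get_cpu_model(42) == CPUModel::GENERIC ? 0 : 1;
}
```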
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 62a2477..5037ac5 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,26 @@
{
bool compare_detection_window(const DetectionWindow &lhs, const DetectionWindow &rhs)
{
- return lhs.score > rhs.score;
+ if(lhs.idx_class < rhs.idx_class)
+ {
+ return true;
+ }
+ if(rhs.idx_class < lhs.idx_class)
+ {
+ return false;
+ }
+
+ // idx_class values are equal, so compare by score
+ if(lhs.score > rhs.score)
+ {
+ return true;
+ }
+ if(rhs.score > lhs.score)
+ {
+ return false;
+ }
+
+ return false;
}
} // namespace
@@ -70,7 +89,7 @@
const size_t num_candidates = _input_output->num_values();
size_t num_detections = 0;
- // Sort list of candidates
+ // Sort list of candidates by idx_class and then score
std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
const float min_distance_pow2 = _min_distance * _min_distance;
@@ -96,7 +115,7 @@
const float xc = cur.x + cur.width * 0.5f;
const float yc = cur.y + cur.height * 0.5f;
- for(size_t k = i + 1; k < num_candidates; ++k)
+ for(size_t k = i + 1; k < (num_candidates) && (cur.idx_class == _input_output->at(k).idx_class); ++k)
{
const float xn = _input_output->at(k).x + _input_output->at(k).width * 0.5f;
const float yn = _input_output->at(k).y + _input_output->at(k).height * 0.5f;
@@ -110,7 +129,7 @@
if(d < min_distance_pow2)
{
- // Invalidate keypoint
+ // Invalidate detection window
_input_output->at(k).score = 0.0f;
}
}
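
The comparator rewrite turns the sort into a strict weak ordering over (idx_class ascending, score descending), so the per-class suppression loop can stop at the first window of a different class. A sketch with an equivalent lambda, since compare_detection_window itself is file-local:

```cpp
#include "arm_compute/core/Types.h"

#include <algorithm>
#include <vector>

using arm_compute::DetectionWindow;

int main()
{
    DetectionWindow a, b, c;
    a.idx_class = 1; a.score = 0.9f;
    b.idx_class = 0; b.score = 0.2f;
    c.idx_class = 0; c.score = 0.7f;

    std::vector<DetectionWindow> v{ a, b, c };
    std::sort(v.begin(), v.end(), [](const DetectionWindow &lhs, const DetectionWindow &rhs)
    {
        if(lhs.idx_class != rhs.idx_class)
        {
            return lhs.idx_class < rhs.idx_class; // ascending class
        }
        return lhs.score > rhs.score; // descending score within a class
    });
    // v is now: (class 0, 0.7), (class 0, 0.2), (class 1, 0.9)
    return 0;
}
```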
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
new file mode 100644
index 0000000..d77d9c1
--- /dev/null
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+CPPUpsampleKernel::CPPUpsampleKernel()
+ : _input(nullptr), _output(nullptr), _info(), _inner_border()
+{
+}
+
+bool CPPUpsampleKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void CPPUpsampleKernel::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info, unsigned int inner_border_right, unsigned int inner_border_top)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _input = input;
+ _output = output;
+ _info = info;
+ _inner_border = std::make_pair(inner_border_right, inner_border_top);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The CPPUpsampleKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICPPKernel::configure(win);
+}
+
+void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ // Compute the bounds of the upsampled output; the buffer itself is zero-filled below
+ const int width_scaled = _output->info()->dimension(0);
+ const int height_scaled = _output->info()->dimension(1);
+ const int stride_x = _info.stride().first;
+ const int stride_y = _info.stride().second;
+ const int start_x = _info.pad().first;
+ const int start_y = _inner_border.second + _info.pad().second;
+ const int end_y = height_scaled - _info.pad().second;
+ const int end_x = width_scaled - _inner_border.first - _info.pad().first;
+
+ std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
+
+ // Create window
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
+ window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
+
+ // Create iterators
+ Iterator in(_input, window);
+ Iterator out(_output, window_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ *(reinterpret_cast<float *>(out.ptr())) = *(reinterpret_cast<const float *>(in.ptr()));
+ },
+ in, out);
+}
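
A 1-D sketch of the scatter pattern run() implements, assuming stride 2, no padding and no inner border: each input element lands stride elements apart, and the gaps keep the zero fill:

```cpp
#include <array>
#include <cstdio>

int main()
{
    constexpr int stride = 2;
    const std::array<float, 3> in{ { 1.f, 2.f, 3.f } };
    std::array<float, 6>       out{}; // zero-filled, as run() does with std::fill_n

    for(std::size_t i = 0; i < in.size(); ++i)
    {
        out[i * stride] = in[i]; // scatter with stride; the rest stays zero
    }
    for(float v : out)
    {
        std::printf("%.0f ", v); // prints: 1 0 2 0 3 0
    }
    return 0;
}
```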
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/GLES_COMPUTE/GCHelpers.cpp
similarity index 66%
copy from src/graph/CL/CLUnmap.cpp
copy to src/core/GLES_COMPUTE/GCHelpers.cpp
index 31f2f19..8970688 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/GLES_COMPUTE/GCHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLUnmap.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute::graph;
-
-CLUnmap::CLUnmap(ITensorObject *tensor)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
+namespace arm_compute
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
+GPUTarget get_target_from_device()
{
- _tensor->unmap(arm_compute::CLScheduler::get().queue());
+ const std::string device_name = reinterpret_cast<const char *>(glGetString(GL_RENDERER));
+
+ return get_target_from_name(device_name);
}
+} // namespace arm_compute
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index d4ce388..25ac02e 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -152,9 +152,9 @@
ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name));
_shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
- ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name);
+ ARM_COMPUTE_ERROR_ON_MSG(_shader_params_index == GL_INVALID_INDEX, "Failed to get index of %s", _shader_params_name);
ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size));
- ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name);
+ ARM_COMPUTE_ERROR_ON_MSG(_shader_params_size == 0, "Failed to get size of %s", _shader_params_name);
}
void GCKernel::cleanup()
@@ -232,6 +232,14 @@
{
#ifdef EMBEDDED_KERNELS
{
+ "helpers_cs.h",
+#include "./cs_shaders/helpers_cs.hembed"
+ },
+ {
+ "activation_layer_helpers_cs.h",
+#include "./cs_shaders/activation_layer_helpers_cs.hembed"
+ },
+ {
"absdiff.cs",
#include "./cs_shaders/absdiff.csembed"
},
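
The .hembed files are build-generated string literals, so the table being extended here is just a map from shader file name to embedded source. A self-contained sketch of the same lookup, with inline literals standing in for the generated includes:

    #include <map>
    #include <stdexcept>
    #include <string>

    // Same shape as the embedded-kernel table above: shader file name -> GLSL source.
    // Inline string literals stand in for the generated .hembed includes.
    static const std::map<std::string, std::string> program_map =
    {
        { "helpers_cs.h", "/* common helper functions */" },
        { "absdiff.cs", "/* absdiff shader source */" },
    };

    const std::string &program_source(const std::string &name)
    {
        const auto it = program_map.find(name);
        if(it == program_map.end())
        {
            throw std::runtime_error("Embedded program not found: " + name);
        }
        return it->second;
    }
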
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
index 55b7f0d..ecd63b5 100644
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -62,7 +62,7 @@
}
IGCKernel::IGCKernel()
- : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U))
+ : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U)), _target(GPUTarget::MIDGARD)
{
}
diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp
index d2539d0..e93b360 100644
--- a/src/core/GLES_COMPUTE/OpenGLES.cpp
+++ b/src/core/GLES_COMPUTE/OpenGLES.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,8 @@
using glMemoryBarrier_func = void GL_APIENTRY (*)(GLbitfield barriers);
using glUniform1ui_func = void GL_APIENTRY (*)(GLint location, GLuint v0);
using glUnmapBuffer_func = GLboolean GL_APIENTRY (*)(GLenum target);
-using glGetError_func = GLenum GL_APIENTRY (*)();
+using glGetError_func = GLenum GL_APIENTRY (*)();
+using glGetString_func = const GLubyte * GL_APIENTRY (*)(GLenum name);
using glGetActiveUniformBlockiv_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params);
using glUniformBlockBinding_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
using glGetUniformBlockIndex_func = GLuint GL_APIENTRY (*)(GLuint program, const GLchar *uniformBlockName);
@@ -668,6 +669,19 @@
}
}
+const GLubyte *GL_APIENTRY glGetString(GLenum name)
+{
+ auto func = GLESSymbols::get().glGetString;
+ if(func != nullptr)
+ {
+ return func(name);
+ }
+ else
+ {
+ return nullptr;
+ }
+}
+
void GL_APIENTRY glGetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params)
{
auto func = GLESSymbols::get().glGetActiveUniformBlockiv;
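
Every wrapper in OpenGLES.cpp follows the shape shown for glGetString: resolve the symbol once through the GLESSymbols singleton and fall back to a benign default when the entry point is missing. A stripped-down C++ sketch of that pattern (the driver library name and direct dlopen resolution are assumptions; the library routes this through GLESSymbols):

    #include <dlfcn.h>

    // Resolve an entry point once, cache it, and degrade gracefully when absent.
    using glGetError_f = unsigned (*)();

    unsigned gl_get_error_checked()
    {
        static glGetError_f func = []() -> glGetError_f
        {
            void *handle = dlopen("libGLESv2.so", RTLD_LAZY); // assumed driver library
            return (handle != nullptr) ? reinterpret_cast<glGetError_f>(dlsym(handle, "glGetError")) : nullptr;
        }();
        return (func != nullptr) ? func() : 0u; // 0 == GL_NO_ERROR, a benign default
    }
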
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index 7d3f4ee..9a1e233 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,97 +23,9 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "activation_layer_helpers_cs.h"
#include "helpers_cs.h"
-#ifdef DATA_TYPE_FP32
-precision highp float;
-#elif defined(DATA_TYPE_FP16)
-#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
-precision highp float;
-#else /*LOGISTIC_TANH_SRELU_SQRT*/
-precision mediump float;
-#endif /*LOGISTIC_TANH_SRELU_SQRT*/
-#endif /*DATA_TYPE_FP32*/
-
-#define ABS_OP(a) abs((a))
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define MLA_OP(a, b, c) ((b) * (c) + (a))
-#define DIV_OP(a, b) ((a) / (b))
-#define EXP_OP(a) exp((a))
-#define LOG_OP(a) log((a))
-#define SQRT_OP(a) sqrt((a))
-#define CONST_ONE (1.f)
-
-// Logistic Activation
-float logistic_op(float x)
-{
- return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
-}
-// Hyperbolic Tangent Activation
-float tanh_op(float x)
-{
- float tmp = float(B_VAL) * x;
- if(tmp > 10.f)
- {
- return MUL_OP(float(A_VAL), 1.f);
- }
- else if(tmp < -10.f)
- {
- return MUL_OP(float(A_VAL), -1.f);
- }
- else
- {
- return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
- }
-}
-// RELU Tangent Activation
-float relu_op(float x)
-{
- return max(0.f, x);
-}
-// Bounded RELU Activation
-float brelu_op(float x)
-{
- return min(float(A_VAL), max(float(0.0), x));
-}
-// Lower Upper Bounded RELU Activation
-float lu_brelu_op(float x)
-{
- return min(max(x, float(B_VAL)), float(A_VAL));
-}
-// Leaky RELU Activation
-float lrelu_op(float x)
-{
- return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
-}
-// Soft RELU Activation
-float srelu_op(float x)
-{
- return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
-}
-// Absolute Activation
-float abs_op(float x)
-{
- return ABS_OP(x);
-}
-// Square Activation
-float square_op(float x)
-{
- return MUL_OP(x, x);
-}
-// Square-root Activation
-float sqrt_op(float x)
-{
- return SQRT_OP(x);
-}
-// Linear Activation
-float linear_op(float x)
-{
- return MLA_OP(float(B_VAL), float(A_VAL), x);
-}
-
 /** This performs an activation function on floating point inputs.
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
new file mode 100644
index 0000000..f43a33f
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
+precision highp float;
+#else /*LOGISTIC_TANH_SRELU_SQRT*/
+precision mediump float;
+#endif /*LOGISTIC_TANH_SRELU_SQRT*/
+#endif /*DATA_TYPE_FP32*/
+
+#define ABS_OP(a) abs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define CONST_ONE (1.f)
+
+// Logistic Activation
+float logistic_op(float x)
+{
+ return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+vec4 logistic_op(vec4 x)
+{
+ return DIV_OP(vec4(CONST_ONE), ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+float tanh_op(float x)
+{
+ float tmp = float(B_VAL) * x;
+ if(tmp > 10.f)
+ {
+ return MUL_OP(float(A_VAL), 1.f);
+ }
+ else if(tmp < -10.f)
+ {
+ return MUL_OP(float(A_VAL), -1.f);
+ }
+ else
+ {
+ return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
+ }
+}
+// RELU Activation
+float relu_op(float x)
+{
+ return max(0.f, x);
+}
+vec4 relu_op(vec4 x)
+{
+ return max(vec4(0.f), x);
+}
+// Bounded RELU Activation
+float brelu_op(float x)
+{
+ return min(float(A_VAL), max(float(0.0), x));
+}
+// Lower Upper Bounded RELU Activation
+float lu_brelu_op(float x)
+{
+ return min(max(x, float(B_VAL)), float(A_VAL));
+}
+// Leaky RELU Activation
+float lrelu_op(float x)
+{
+ return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
+}
+// Soft RELU Activation
+float srelu_op(float x)
+{
+ return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+float abs_op(float x)
+{
+ return ABS_OP(x);
+}
+// Square Activation
+float square_op(float x)
+{
+ return MUL_OP(x, x);
+}
+// Square-root Activation
+float sqrt_op(float x)
+{
+ return SQRT_OP(x);
+}
+// Linear Activation
+float linear_op(float x)
+{
+ return MLA_OP(float(B_VAL), float(A_VAL), x);
+}
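
These scalar ops translate directly to host code. The clamp in tanh_op, for example, guards exp() against overflow for large |B_VAL * x|, where tanh has already saturated to +/-1. A C++ rendering of two of the functions, with A_VAL/B_VAL as runtime parameters instead of preprocessor defines:

    #include <cmath>

    // tanh activation with the same saturation guard as tanh_op above: for
    // |b * x| > 10, tanh is indistinguishable from +/-1 in float, so skip it.
    float tanh_act(float x, float a, float b)
    {
        const float t = b * x;
        if(t > 10.f)  return a;
        if(t < -10.f) return -a;
        return a * std::tanh(t);
    }

    // Soft RELU, as in srelu_op: log(1 + exp(x)).
    float soft_relu(float x)
    {
        return std::log(1.f + std::exp(x));
    }
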
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index 7629b25..81be967 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -50,6 +50,8 @@
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
* @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
+ * @note Beta is optional and defaults to 0. If beta is not provided, the preprocessor argument "USE_DEFAULT_BETA" should be given
+ * @note Gamma is optional and defaults to 1. If gamma is not provided, the preprocessor argument "USE_DEFAULT_GAMMA" should be given
*
* @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
* @param[in] src_attrs The attributes of the source tensor
@@ -59,10 +61,10 @@
* @param[in] mean_attrs The attributes of the mean tensor
* @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr
* @param[in] var_attrs The attributes of the var tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr
- * @param[in] beta_attrs The attributes of the beta tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
- * @param[in] gamma_attrs The attributes of the gamma tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. If not provided, default value of beta is 0. Supported data types: same as @p src_ptr
+ * @param[in] beta_attrs (Optional) The attributes of the beta tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. If not provided, default value of gamma is 1. Supported data types: same as @p src_ptr
+ * @param[in] gamma_attrs (Optional) The attributes of the gamma tensor
*/
SHADER_PARAMS_DECLARATION
{
@@ -70,8 +72,12 @@
Tensor3DAttributes dst_attrs;
VectorAttributes mean_attrs;
VectorAttributes var_attrs;
- VectorAttributes beta_attrs;
- VectorAttributes gamma_attrs;
+#ifndef USE_DEFAULT_BETA
+ VectorAttributes beta_attrs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VectorAttributes gamma_attrs;
+#endif /* USE_DEFAULT_GAMMA */
};
#ifdef DATA_TYPE_FP32
@@ -79,24 +85,34 @@
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
TENSOR_DECLARATION(3, meanBuffer, float, mean_ptr, mean_shift, 2, readonly);
TENSOR_DECLARATION(4, varBuffer, float, var_ptr, var_shift, 2, readonly);
+#ifndef USE_DEFAULT_BETA
TENSOR_DECLARATION(5, betaBuffer, float, beta_ptr, beta_shift, 2, readonly);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+#ifdef USE_DEFAULT_BETA
+TENSOR_DECLARATION(5, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
+#else /* USE_DEFAULT_BETA */
TENSOR_DECLARATION(6, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
+#endif /* USE_DEFAULT_BETA */
+#endif /* USE_DEFAULT_GAMMA */
void main(void)
{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
- VectorIterator var_iter = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
- VectorIterator beta_iter = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
- VectorIterator gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+ VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
+ VectorIterator var_iter = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
+#ifndef USE_DEFAULT_BETA
+ VectorIterator beta_iter = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VectorIterator gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+#endif /* USE_DEFAULT_GAMMA */
float input_value = 0.f;
float denominator = 0.f;
float numerator = 0.f;
float x_bar = 0.f;
- float gamma_param = 0.f;
- float beta_param = 0.f;
uint current_slice = gl_GlobalInvocationID.z;
@@ -109,10 +125,18 @@
numerator = SUB_OP(input_value, numerator);
x_bar = MUL_OP(numerator, denominator);
- gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
- beta_param = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+#ifndef USE_DEFAULT_GAMMA
+ float gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * gamma_attrs.stride_x));
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param)));
+ x_bar = MUL_OP(gamma_param, x_bar);
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
+ float beta_param = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+
+ x_bar = ADD_OP(x_bar, beta_param);
+#endif /* USE_DEFAULT_BETA */
+
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, ACTIVATION_FUNC(x_bar));
}
#elif defined(DATA_TYPE_FP16)
@@ -120,8 +144,16 @@
TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
TENSOR_DECLARATION(3, meanBuffer, uvec2, mean_ptr, mean_shift, 3, readonly);
TENSOR_DECLARATION(4, varBuffer, uvec2, var_ptr, var_shift, 3, readonly);
+#ifndef USE_DEFAULT_BETA
TENSOR_DECLARATION(5, betaBuffer, uvec2, beta_ptr, beta_shift, 3, readonly);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+#ifdef USE_DEFAULT_BETA
+TENSOR_DECLARATION(5, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
+#else /* USE_DEFAULT_BETA */
TENSOR_DECLARATION(6, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
+#endif /* USE_DEFAULT_BETA */
+#endif /* USE_DEFAULT_GAMMA */
void main(void)
{
@@ -129,14 +161,18 @@
Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
VectorIterator var_iter = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
+#ifndef USE_DEFAULT_BETA
VectorIterator beta_iter = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
VectorIterator gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+#endif /* USE_DEFAULT_GAMMA */
vec4 unpacked_s[5];
float denominator;
float numerator;
- float gamma_param;
- float beta_param;
+ float gamma_param = 1.f;
+ float beta_param = 0.f;
vec4 x_bar;
vec4 result;
@@ -144,68 +180,87 @@
unpacked_s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
unpacked_s[1] = LOAD_UNPACK4_HALF(var_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(var_iter, current_slice * var_attrs.stride_x));
unpacked_s[2] = LOAD_UNPACK4_HALF(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
- unpacked_s[3] = LOAD_UNPACK4_HALF(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
+#ifndef USE_DEFAULT_GAMMA
+ unpacked_s[3] = LOAD_UNPACK4_HALF(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * gamma_attrs.stride_x));
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
unpacked_s[4] = LOAD_UNPACK4_HALF(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+#endif /* USE_DEFAULT_BETA */
if((current_slice % uint(4)) == uint(0))
{
denominator = unpacked_s[1].x;
denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
- //Calculate x bar and store results
- numerator = unpacked_s[2].x;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+ // Calculate x bar
+ numerator = unpacked_s[2].x;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+#ifndef USE_DEFAULT_GAMMA
gamma_param = unpacked_s[3].x;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
beta_param = unpacked_s[4].x;
- result = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
}
else if((current_slice % uint(4)) == uint(1))
{
denominator = unpacked_s[1].y;
denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
- //Calculate x bar and store results
- numerator = unpacked_s[2].y;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+ // Calculate x bar
+ numerator = unpacked_s[2].y;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+#ifndef USE_DEFAULT_GAMMA
gamma_param = unpacked_s[3].y;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
beta_param = unpacked_s[4].y;
- result = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
}
else if((current_slice % uint(4)) == uint(2))
{
denominator = unpacked_s[1].z;
denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
- //Calculate x bar and store results
- numerator = unpacked_s[2].z;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+ // Calculate x bar
+ numerator = unpacked_s[2].z;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+#ifndef USE_DEFAULT_GAMMA
gamma_param = unpacked_s[3].z;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
beta_param = unpacked_s[4].z;
- result = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
}
else
{
denominator = unpacked_s[1].w;
denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
- //Calculate x bar and store results
- numerator = unpacked_s[2].w;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+ // Calculate x bar
+ numerator = unpacked_s[2].w;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+#ifndef USE_DEFAULT_GAMMA
gamma_param = unpacked_s[3].w;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
beta_param = unpacked_s[4].w;
- result = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
}
+
+#ifndef USE_DEFAULT_GAMMA
+ x_bar = MUL_OP(gamma_param, x_bar);
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
+ x_bar = ADD_OP(x_bar, beta_param);
+#endif /* USE_DEFAULT_BETA */
+
+ result = ACTIVATION_FUNC(x_bar);
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
}
#endif /*DATA_TYPE_FP16*/
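
With both defaults in effect the shader reduces to y = act((x - mean) / sqrt(var + eps)); gamma scales and beta shifts only when their buffers are actually bound. The per-element computation, as a host-side C++ sketch:

    #include <cmath>

    // Per-element batch normalization matching the shader's dataflow.
    // gamma defaults to 1 (USE_DEFAULT_GAMMA) and beta to 0 (USE_DEFAULT_BETA).
    float batch_norm(float x, float mean, float var, float epsilon,
                     float gamma = 1.f, float beta = 0.f)
    {
        const float denominator = 1.f / std::sqrt(var + epsilon); // INVSQRT_OP(ADD_OP(...))
        float x_bar = (x - mean) * denominator;
        x_bar = gamma * x_bar; // no-op when USE_DEFAULT_GAMMA (gamma == 1)
        x_bar = x_bar + beta;  // no-op when USE_DEFAULT_BETA  (beta == 0)
        return x_bar;          // ACTIVATION_FUNC would be applied here
    }
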
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 774173d..5e7609c 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -62,7 +62,56 @@
uint total_filters;
};
-#if defined(DATA_TYPE_FP16)
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+#ifdef HAS_BIAS
+TENSOR_DECLARATION(3, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
+#endif /* HAS_BIAS */
+
+void main()
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+#ifdef HAS_BIAS
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* HAS_BIAS */
+
+ bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
+ && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(
+ gl_GlobalInvocationID.z)
+ * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
+ // Linearize convolution elements
+ if(is_last_thread)
+ {
+ for(uint i = 0u; i < uint(total_filters); ++i)
+ {
+ float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+#ifdef HAS_BIAS
+ float b = LOAD_CURRENT_ITEM(biases_ptr, biases_iter);
+ STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, biases_attrs.stride_x);
+#endif /* HAS_BIAS */
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
+ }
+ }
+ else
+ {
+ for(uint i = 0u; i < uint(total_filters); ++i)
+ {
+ float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
+ }
+ }
+}
+
+#elif defined(DATA_TYPE_FP16)
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
@@ -72,10 +121,10 @@
void main()
{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
@@ -151,7 +200,7 @@
}
}
-#endif /* DATA_TYPE_FP16 */
+#endif /* DATA_TYPE_FP32 */
#endif // RESHAPE_TO_COLUMNS
#ifdef IM2COL_GENERIC
@@ -164,6 +213,7 @@
* @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx"
* @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx"
* @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx"
+ * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx"
* @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
@@ -192,30 +242,31 @@
Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
- uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor
- uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor
- uint ch = gl_GlobalInvocationID.z % KERNEL_DEPTH; // input feature map
- uint batch = gl_GlobalInvocationID.z / KERNEL_DEPTH; // the batch
+ int xc = int(gl_GlobalInvocationID.x); // x coordinate in the convolved tensor
+ int yc = int(gl_GlobalInvocationID.y); // y coordinate in the convolved tensor
+ int ch = int(gl_GlobalInvocationID.z) % KERNEL_DEPTH; // input feature map
+ int batch = int(gl_GlobalInvocationID.z) / KERNEL_DEPTH; // the batch
 // Calculate input indices
- uint xi = xc * uint(STRIDE_X) - uint(PAD_LEFT);
- uint yi = yc * uint(STRIDE_Y) - uint(PAD_TOP);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w));
+ int xi = xc * STRIDE_X - PAD_LEFT;
+ int yi = yc * STRIDE_Y - PAD_TOP;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * int(src_attrs.stride_z)) + (batch * int(src_stride_w)));
 // Calculate output indices
- uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
- uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo);
+ int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+ int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+ // sizeof is not available in GLES, so we'll use stride_x
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * int(dst_attrs.stride_y)) + (batch * int(dst_stride_w)) + xo * int(dst_attrs.stride_x));
uint src_pos = 0u;
// Linearize convolution elements
- for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
+ for(int y = yi, y_e = yi + KERNEL_HEIGHT * DILATION_Y; y < y_e; y += DILATION_Y)
{
- for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, TENSOR_OFFSET_ADVANCE(dst_iter, 1u))
+ for(int x = xi, x_e = xi + KERNEL_WIDTH * DILATION_X; x < x_e; x += DILATION_X, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, int(dst_attrs.stride_x)))
{
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
+ src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
@@ -224,7 +275,7 @@
}
else
{
- src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
+ src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
}
#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
@@ -232,7 +283,7 @@
}
#ifdef HAS_BIAS
- if(ch == (uint(KERNEL_DEPTH) - 1))
+ if(ch == (KERNEL_DEPTH - 1))
{
STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
}
@@ -659,6 +710,7 @@
#endif /* DATA_TYPE_FP32 */
#endif /* IM2COL_REDUCED */
+#ifdef COL2IM
#ifdef WIDTH_OUTPUT
/** This kernel performs a reshaping of the output of the convolution layer.
@@ -692,10 +744,9 @@
Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * WIDTH_OUTPUT * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * (src_attrs.stride_z));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter,
- LOAD_CURRENT_ITEM(src_ptr, src_iter));
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
}
#elif defined(DATA_TYPE_FP16)
@@ -735,4 +786,5 @@
#else /* DATA_TYPE_FP32 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */
+#endif /* WIDTH_OUTPUT */
#endif /* COL2IM */
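
The dilation support added to IM2COL_GENERIC only changes the sampling positions: the kernel still gathers KERNEL_WIDTH x KERNEL_HEIGHT taps, but consecutive taps are now DILATION_X/DILATION_Y apart. A host-side C++ sketch of the index arithmetic for one output column (padded positions zero-filled):

    #include <vector>

    // im2col for one convolution position (xc, yc) of a single input channel,
    // with stride, padding and dilation as in the IM2COL_GENERIC shader above.
    std::vector<float> im2col_column(const std::vector<float> &src, int src_w, int src_h,
                                     int xc, int yc, int k_w, int k_h,
                                     int stride_x, int stride_y, int pad_left, int pad_top,
                                     int dilation_x, int dilation_y)
    {
        std::vector<float> col;
        const int xi = xc * stride_x - pad_left;
        const int yi = yc * stride_y - pad_top;
        for(int y = yi; y < yi + k_h * dilation_y; y += dilation_y)
        {
            for(int x = xi; x < xi + k_w * dilation_x; x += dilation_x)
            {
                const bool in_bounds = (x >= 0 && x < src_w && y >= 0 && y < src_h);
                col.push_back(in_bounds ? src[y * src_w + x] : 0.f); // zero padding
            }
        }
        return col;
    }
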
diff --git a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
index adfc126..134cc10 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -108,6 +108,8 @@
uint z_index = gl_GlobalInvocationID.z;
TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
+ src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
+
vec4 w[3];
w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
@@ -263,6 +265,8 @@
uint z_index = gl_GlobalInvocationID.z;
TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
+ src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
+
vec4 w[3];
w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
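
The offset correction added in both hunks implements the DEPTH_MULTIPLIER contract: output channel z reads input channel z / DEPTH_MULTIPLIER, so the source iterator, which has already advanced z steps, is pulled back by the surplus. In C++ terms:

    // Depthwise convolution with depth multiplier m: each input channel produces
    // m output channels, so output channel z maps to input channel z / m.
    unsigned input_channel_for(unsigned z, unsigned depth_multiplier)
    {
        return z / depth_multiplier;
    }

    // The shader expresses the same mapping as a relative correction: the 3D
    // iterator already advanced z steps along Z, so move it back (z - z / m) steps.
    int offset_correction_in_bytes(unsigned z, unsigned depth_multiplier, unsigned step_z)
    {
        return -static_cast<int>((z - z / depth_multiplier) * step_z);
    }
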
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index ea4e9c1..b42c09b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,10 @@
#include "helpers_cs.h"
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16
@@ -99,6 +103,10 @@
pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
@@ -210,6 +218,10 @@
pixels += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
#elif defined(PROCESS_4X_2Y_1Z)
@@ -333,6 +345,11 @@
pixels[1] += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
}
@@ -470,6 +487,12 @@
pixels[2] += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -609,6 +632,13 @@
pixels1[1] += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels1[0] = ACT_OP(pixels1[0]);
+ pixels1[1] = ACT_OP(pixels1[1]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels1[0]);
@@ -745,6 +775,11 @@
pixels[1] += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
@@ -868,6 +903,11 @@
pixels[1] += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
#elif defined(PROCESS_8X_2Y_1Z)
@@ -1001,6 +1041,13 @@
pixels1[1] += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels1[0] = ACT_OP(pixels1[0]);
+ pixels1[1] = ACT_OP(pixels1[1]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
}
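
The FUSED_ACTIVATION hook is identical across the direct-convolution shaders: ACT_OP resolves to one of the helpers from activation_layer_helpers_cs.h and is applied to every accumulator just before the store. A host-side analogue of that epilogue, with the activation as a template parameter standing in for the preprocessor define:

    #include <algorithm>

    // Fused bias + activation epilogue, mirroring the #ifdef FUSED_ACTIVATION blocks:
    // add the bias (HAS_BIAS), apply the activation (ACT_OP), then store.
    template <typename Activation>
    void store_with_epilogue(float *dst, const float *acc, int n, float bias, Activation act)
    {
        for(int i = 0; i < n; ++i)
        {
            dst[i] = act(acc[i] + bias);
        }
    }

    // Usage: store_with_epilogue(out, pixels, 4, b, [](float x) { return std::max(0.f, x); });
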
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index 855d450..e51cc37 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,10 @@
#include "helpers_cs.h"
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16
@@ -114,6 +118,10 @@
pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
@@ -238,6 +246,11 @@
pixels[1] += vec4(b);
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
VSTORE2_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
@@ -335,6 +348,10 @@
pixels += b;
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
@@ -434,6 +451,12 @@
pixels[2] += vec4(b);
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels[0]);
STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -601,6 +624,12 @@
}
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -728,6 +757,10 @@
pixels += vec4(b);
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
@@ -841,6 +874,12 @@
}
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -962,6 +1001,13 @@
}
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels[2] = ACT_OP(pixels[2]);
+ pixels[3] = ACT_OP(pixels[3]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -1087,6 +1133,13 @@
}
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels[0] = ACT_OP(pixels[0]);
+ pixels[1] = ACT_OP(pixels[1]);
+ pixels[2] = ACT_OP(pixels[2]);
+ pixels[3] = ACT_OP(pixels[3]);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index c919e4e..728e964 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,10 @@
#include "helpers_cs.h"
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16
@@ -116,6 +120,10 @@
pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
#elif defined(DATA_TYPE_FP16)
@@ -204,6 +212,10 @@
res += vec4(b);
#endif /* BIAS */
+#ifdef FUSED_ACTIVATION
+ res = ACT_OP(res);
+#endif /* FUSED_ACTIVATION */
+
STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index ba50721..a65f980 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -132,7 +132,7 @@
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_attrs The attributes of the source matrix
@@ -220,7 +220,9 @@
/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_attrs The attributes of the source matrix
@@ -344,6 +346,184 @@
}
#endif /* GEMM_MM_FLOATING_POINT */
+#ifdef GEMM_MM_FLOATING_POINT_BIFROST
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * in case both matrices have not been reshaped
+ *
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_attrs The attributes of the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
+ */
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src0_attrs;
+ ImageAttributes src1_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
+TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
+void main()
+{
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+ /* Compute the starting address for matrix A and matrix B */
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);
+
+ /* Reset accumulators */
+ vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for(; i <= (COLS_A - 4); i += 4)
+ {
+ // Load values from matrix A and matrix B
+ vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vec4 a1 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vec4 a2 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 a3 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.x);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix B
+ b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix B
+ b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.z);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix B
+ b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.w);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ TENSOR_ITERATOR_ADVANCE(src0_iter, 4);
+ }
+
+ for(; i < COLS_A; ++i)
+ {
+ // Load values from matrix A
+ float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+ TENSOR_ITERATOR_ADVANCE(src0_iter, 1);
+ }
+
+ /* Multiply by the weight of the matrix-matrix product */
+ acc0 = acc0 * vec4(ALPHA);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = acc1 * vec4(ALPHA);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = acc2 * vec4(ALPHA);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = acc3 * vec4(ALPHA);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif /* GEMM_MM_FLOATING_POINT_BIFROST */
+
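The Bifrost kernel above is a rank-1-update formulation: for each group of four A columns, a row of B is streamed linearly while one A scalar at a time is broadcast against it, keeping the accumulators in registers. A scalar C++ sketch of the same loop structure for one row of A (so NUM_ELEMS_PROCESSED_PER_THREAD_Y = 1, tile width 4):

    #include <array>

    // One thread-row of the GEMM: acc accumulates a 1x4 tile of C = alpha * A * B.
    // a points at a row of A (cols_a elements); b is row-major with leading dimension ldb.
    std::array<float, 4> gemm_row(const float *a, const float *b, int cols_a, int ldb, float alpha)
    {
        std::array<float, 4> acc{};
        for(int k = 0; k < cols_a; ++k)
        {
            const float a0 = a[k]; // scalar from A, broadcast against a 4-wide row of B
            for(int x = 0; x < 4; ++x)
            {
                acc[x] += b[k * ldb + x] * a0; // acc0 += b0 * vec4(a0.x) in the shader
            }
        }
        for(float &v : acc)
        {
            v *= alpha; // acc0 = acc0 * vec4(ALPHA)
        }
        return acc;
    }
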
#ifdef GEMM_MATRIXADDITION
/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
@@ -461,7 +641,7 @@
/** This OpenGL ES kernel computes the matrix multiplication between matrix A(src0) and matrix B(src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix.Supported data types: F16
* @param[in] src0_attrs The attributes of the source matrix
@@ -836,7 +1016,7 @@
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_attrs The attributes of the source matrix
diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in
index 15ce8ee..17e3aee 100644
--- a/src/core/GLES_COMPUTE/gl_entries.in
+++ b/src/core/GLES_COMPUTE/gl_entries.in
@@ -61,3 +61,4 @@
GL_ENTRY(glDeleteFramebuffers)
GL_ENTRY(glBindFramebuffer)
GL_ENTRY(glFramebufferTexture2D)
+GL_ENTRY(glGetString)
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index d7c645d..8287823 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -111,8 +111,9 @@
_output->set_needs_shifting(true);
- Window slice = window.first_slice_window_3D();
- Window slice_in = window.first_slice_window_3D();
+ Window collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_in = collapsed.first_slice_window_3D();
slice.shift(Window::DimX, -(_output->info()->padding()).left);
@@ -125,10 +126,10 @@
{
unsigned int idx = 0;
unsigned int binding = 1;
- add_3D_tensor_argument(idx, _input, binding++, slice_in);
- add_3D_tensor_argument(idx, _output, binding++, slice);
+ add_3D_tensor_argument(idx, _input, binding++, slice);
+ add_3D_tensor_argument(idx, _output, binding++, slice_in);
_kernel.update_shader_params();
enqueue(*this, slice);
}
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+ while(collapsed.slide_window_slice_3D(slice) && collapsed.slide_window_slice_3D(slice_in));
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index cd93f69..9a592df 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -36,6 +36,105 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *var,
+ const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon, ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_UNUSED(epsilon);
+ ARM_COMPUTE_UNUSED(var);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ if(beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
+ }
+ if(gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
+ }
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
+ ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
+ && act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
+ && act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+ }
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ ITensorInfo *mean, ITensorInfo *var,
+ ITensorInfo *beta, ITensorInfo *gamma)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+
+ unsigned int num_elems_processed_per_iteration = 1;
+ if(input->data_type() == DataType::F16)
+ {
+ num_elems_processed_per_iteration = 4;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic mean_access(mean, 0, 0, mean->dimension(0) + 3, mean->dimension(1));
+ AccessWindowStatic var_access(var, 0, 0, var->dimension(0) + 3, var->dimension(1));
+
+ bool window_changed = false;
+ if(beta != nullptr)
+ {
+ AccessWindowStatic beta_access(beta, 0, 0, beta->dimension(0) + 3, beta->dimension(1));
+ if(gamma != nullptr)
+ {
+ AccessWindowStatic gamma_access(gamma, 0, 0, gamma->dimension(0) + 3, gamma->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access);
+ }
+ }
+ else
+ {
+ if(gamma != nullptr)
+ {
+ AccessWindowStatic gamma_access(gamma, 0, 0, gamma->dimension(0) + 3, gamma->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, gamma_access);
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access);
+ }
+ }
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
GCBatchNormalizationLayerKernel::GCBatchNormalizationLayerKernel()
: _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0.0f)
{
@@ -44,24 +143,11 @@
void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma,
float epsilon, ActivationLayerInfo act_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, var);
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
- if(act_info.enabled())
- {
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() != DataType::F32 && input->info()->data_type() != DataType::F16);
- ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
- }
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), var->info(),
+ (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr,
+ epsilon, act_info));
_input = input;
_output = output;
@@ -71,12 +157,6 @@
_gamma = gamma;
_epsilon = epsilon;
- unsigned int num_elems_processed_per_iteration = 1;
- if(input->info()->data_type() == DataType::F16)
- {
- num_elems_processed_per_iteration = 4;
- }
-
// Set build options
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
@@ -85,6 +165,14 @@
build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+ if(beta == nullptr)
+ {
+ build_opts.emplace("#define USE_DEFAULT_BETA");
+ }
+ if(gamma == nullptr)
+ {
+ build_opts.emplace("#define USE_DEFAULT_GAMMA");
+ }
if(act_info.enabled())
{
@@ -97,19 +185,25 @@
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), var->info(),
+ (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 3, mean->info()->dimension(1));
- AccessWindowStatic var_access(var->info(), 0, 0, var->info()->dimension(0) + 3, var->info()->dimension(1));
- AccessWindowStatic beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 3, beta->info()->dimension(1));
- AccessWindowStatic gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 3, gamma->info()->dimension(1));
+ IGCKernel::configure(win_config.second);
+}
- update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+Status GCBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *var,
+ const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon, ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+ mean->clone().get(), var->clone().get(),
+ beta->clone().get(), gamma->clone().get())
+ .first);
- IGCKernel::configure(win);
+ return Status{};
}
void GCBatchNormalizationLayerKernel::run(const Window &window)
@@ -127,11 +221,18 @@
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- add_1D_tensor_argument(idx, _mean, 3, vector_slice);
- add_1D_tensor_argument(idx, _var, 4, vector_slice);
- add_1D_tensor_argument(idx, _beta, 5, vector_slice);
- add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ unsigned int binding_point = 3;
+ add_1D_tensor_argument(idx, _mean, binding_point, vector_slice);
+ add_1D_tensor_argument(idx, _var, ++binding_point, vector_slice);
+ if(_beta != nullptr)
+ {
+ add_1D_tensor_argument(idx, _beta, ++binding_point, vector_slice);
+ }
+ if(_gamma != nullptr)
+ {
+ add_1D_tensor_argument(idx, _gamma, ++binding_point, vector_slice);
+ }
slice.shift(Window::DimX, -(_output->info()->padding()).left);
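
Beta and gamma are now optional in this kernel: a missing tensor compiles the shader with USE_DEFAULT_BETA or USE_DEFAULT_GAMMA and skips the corresponding binding point at run time. A minimal scalar sketch of the computation this implies, assuming the conventional batch-normalization formula (batch_norm_reference is an illustrative helper, not library code):

#include <cmath>
#include <cstddef>

void batch_norm_reference(const float *x, float *y, std::size_t n,
                          float mean, float var, float epsilon,
                          const float *beta, const float *gamma)
{
    const float scale = (gamma != nullptr) ? *gamma : 1.0f; // USE_DEFAULT_GAMMA
    const float shift = (beta != nullptr) ? *beta : 0.0f;   // USE_DEFAULT_BETA
    const float denominator = 1.0f / std::sqrt(var + epsilon);
    for(std::size_t i = 0; i < n; ++i)
    {
        y[i] = scale * ((x[i] - mean) * denominator) + shift;
    }
}
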
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
index af1e34e..1554a89 100644
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -62,30 +62,32 @@
_output = output;
_convolved_dims = convolved_dims;
- unsigned int num_elems_processed_per_iteration = 1;
+ const DataType dt = input->info()->data_type();
+ const unsigned int local_size = 1;
// Create kernel
std::set<std::string> build_opts;
+ build_opts.emplace("#define COL2IM ");
build_opts.emplace("#define WIDTH_OUTPUT " + support::cpp11::to_string(_convolved_dims.first));
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+ const std::string dt_name = (dt == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(local_size));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(local_size));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(local_size));
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("col2im", build_opts));
// Configure window
- unsigned int nums = 2;
- Window win = calculate_max_window(*output->info(), Steps(nums));
+ const unsigned int num_elems_processed_per_iteration = (dt == DataType::F32) ? 1 : 2;
- AccessWindowHorizontal output_access(output->info(), 0, 2);
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
const int input_padding = ceil_to_multiple(input->info()->dimension(0), 2) - input->info()->dimension(0);
AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + input_padding, input->info()->dimension(1) + 1);
- update_window_and_padding(win, input_access,
- output_access);
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, output->info()->valid_region());
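
The static input access window above rounds the x dimension up to a multiple of two, so the requested right-hand padding is the distance to that multiple. A sketch of the rounding helper this relies on, assuming ceil_to_multiple keeps its usual round-up semantics:

unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple; // round up to the next multiple
}
// e.g. input dimension(0) == 7: ceil_to_multiple(7, 2) == 8, so input_padding == 1
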
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index 9343268..c237409 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -33,31 +33,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
using namespace arm_compute;
-
-namespace
-{
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
- TensorShape output_shape = input_shape;
- output_shape.set(0, output_width);
- output_shape.set(1, output_height);
-
- return output_shape;
-}
-} // namespace
+using namespace arm_compute::misc::shape_calculator;
GCDepthwiseConvolutionLayer3x3Kernel::GCDepthwiseConvolutionLayer3x3Kernel()
: _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0), _lws(gles::NDRange(1U, 1U, 1U))
@@ -69,7 +48,8 @@
return _border_size;
}
-void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -83,7 +63,7 @@
}
// Get convolved dimensions
- TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(),
@@ -93,6 +73,7 @@
input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(2) != weights->info()->dimension(2));
_input = input;
_output = output;
@@ -108,6 +89,7 @@
ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
std::set<std::string> options;
+ options.emplace("#define DEPTH_MULTIPLIER " + support::cpp11::to_string(depth_multiplier));
options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 99b5e7d..6b16def 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -50,7 +50,8 @@
}
template <unsigned int kernel_size>
-void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
@@ -58,6 +59,7 @@
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(act_info.enabled() && act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
if(bias != nullptr)
{
@@ -106,6 +108,16 @@
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
options.emplace(("#define " + dt_name));
+ // Activation information in case of a fused activation
+ if(act_info.enabled())
+ {
+ options.emplace("#define FUSED_ACTIVATION");
+ options.emplace(("#define " + string_from_activation_func(act_info.activation())));
+ options.emplace(("#define ACT_OP " + lower_string(string_from_activation_func(act_info.activation())) + "_op"));
+ options.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
+ options.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
+ }
+
unsigned int num_elems_read_per_iteration_x = kernel_size * _conv_stride_x;
unsigned int num_elems_read_per_iteration_y = 1;
unsigned int num_elems_written_per_iteration_x = 1;
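
The fused-activation defines hand the shader an ACT_OP macro name derived from the activation function, which the checks above restrict to RELU and LOGISTIC. A scalar sketch of what those two ops compute; the real GLSL helpers live in the shader sources, which this diff does not show:

#include <algorithm>
#include <cmath>

float relu_op(float x)     { return std::max(0.0f, x); }            // RELU
float logistic_op(float x) { return 1.0f / (1.0f + std::exp(-x)); } // LOGISTIC (sigmoid)
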
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
index dc86bfb..171fbad 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -35,7 +35,6 @@
#include "arm_compute/core/Window.h"
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel()
: _input(nullptr), _output(nullptr)
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
index 43846dc..1a68a62 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,6 @@
#include "arm_compute/core/Window.h"
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel()
: _input(nullptr), _output(nullptr)
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a5f09e8..d576c30 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,38 +31,182 @@
#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <set>
#include <string>
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+{
+ ARM_COMPUTE_UNUSED(reshape_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
+
+ if(!is_interleaved_transposed)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ }
+ }
+ else
+ {
+ const int m = reshape_info.m();
+ const int n = reshape_info.n();
+ const int k = reshape_info.k();
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+ TensorShape tensor_shape0{ input0->tensor_shape() };
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+ }
+ }
+
+ return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+ GPUTarget gpu_target, ElementsProcessed &num_elements_processed)
+{
+ ARM_COMPUTE_UNUSED(gpu_target);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape tensor_shape{ input0->tensor_shape() };
+ tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0));
+ tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1));
+
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape));
+
+ bool window_changed = false;
+ Window win{};
+
+ const DataType data_type = input0->data_type();
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+ if(is_interleaved_transposed)
+ {
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
+ num_elems_processed_per_iteration_y = 4;
+
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+ AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ else // The input tensors have not been reshaped
+ {
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+ switch(data_type)
+ {
+ case DataType::F16:
+ num_elems_processed_per_iteration_x = 4;
+ break;
+
+ case DataType::F32:
+ num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
+ break;
+
+ default:
+ ARM_COMPUTE_ERROR("Current data type is not supported");
+ break;
+ }
+
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr)
{
}
-void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed)
+void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
- }
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
_input0 = input0;
_input1 = input1;
_output = output;
+ // Get target architecture
+ GPUTarget gpu_target = get_target();
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IGCKernel::configure(win_config.second);
+
+ // Create build options
std::set<std::string> build_opts;
- Window win;
+ std::string kernel_name;
build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
@@ -74,6 +218,12 @@
// Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
if(is_interleaved_transposed)
{
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+ build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width));
+ build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height));
+
switch(input0->info()->data_type())
{
case DataType::F16:
@@ -91,57 +241,32 @@
build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
- // Create kernel
- _kernel = GCKernelLibrary::get().create_kernel(("gemm_mm_interleaved_transposed"), build_opts);
-
- // Configure window kernel
- const unsigned int num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ kernel_name = "gemm_mm_interleaved_transposed";
}
else
{
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
-
// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
- unsigned int num_elems_processed_per_iteration_x;
- unsigned int num_elems_processed_per_iteration_y;
+ GPUTarget arch_target = get_arch_from_target(gpu_target);
switch(input0->info()->data_type())
{
case DataType::F16:
build_opts.emplace("#define DATA_TYPE_FP16");
-
-#define MM_PROCESS_4X_OPTIMIZED
-
-#if defined(MM_PROCESS_4X)
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
- build_opts.emplace("#define MM_PROCESS_4X");
-#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
-#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */
- num_elems_processed_per_iteration_x = 8;
- num_elems_processed_per_iteration_y = 1;
- build_opts.emplace("#define MM_PROCESS_8X");
-#endif /* MM_PROCESS_4X */
+ build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
break;
case DataType::F32:
- num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
build_opts.emplace("#define DATA_TYPE_FP32");
+
+ if(arch_target == GPUTarget::BIFROST && input0->info()->num_dimensions() != 1)
+ {
+ build_opts.emplace("#define GEMM_MM_FLOATING_POINT_BIFROST");
+ }
+ else
+ {
+ build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
+ }
break;
default:
@@ -149,32 +274,31 @@
break;
}
- build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
- build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x));
- build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+ build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));
+ build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));
- // Create kernel
- _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts);
-
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-#if defined(MM_PROCESS_4X_OPTIMIZED)
- AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
-#else /* MM_PROCESS_4X_OPTIMIZED */
- AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
- num_elems_processed_per_iteration_y));
-#endif /* MM_PROCESS_4X_OPTIMIZED */
- AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+ kernel_name = "gemm_mm_floating_point";
}
- IGCKernel::configure(win);
+ // Create kernel
+ _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
+}
+
+Status GCGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
+ const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ is_interleaved_transposed,
+ reshape_info,
+ gpu_target,
+ num_elements_processed)
+ .first);
+ return Status{};
}
void GCGEMMMatrixMultiplyKernel::run(const Window &window)
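
The reshape branch of validate_arguments() reconstructs what the interleaved and transposed inputs should look like and rejects mismatching shapes. A numeric sketch under the assumption that interleave4x4 packs blocks of four rows and transpose1xW packs 16/element_size elements per row (mult factors of 1):

#include <cstddef>

constexpr std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// F32 example: m = 8, n = 12, k = 6.
constexpr std::size_t m = 8, n = 12, k = 6;
constexpr std::size_t W = 16 / sizeof(float);                      // transpose1xW block width
constexpr std::size_t a_resh_w = k * 4, a_resh_h = ceil_div(m, 4); // interleaved A: 24 x 2
constexpr std::size_t b_resh_w = k * W, b_resh_h = ceil_div(n, W); // transposed B:  24 x 3
static_assert(a_resh_w == 24 && a_resh_h == 2, "interleaved A shape");
static_assert(b_resh_w == 24 && b_resh_h == 3, "transposed B shape");
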
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 4ab6f3e..6c89616 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "support/ToolchainSupport.h"
@@ -64,7 +65,7 @@
{
}
-void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -97,7 +98,8 @@
&& (std::equal(input->info()->tensor_shape().cbegin() + 3,
input->info()->tensor_shape().cend(),
output->info()->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+ && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
+ && (dilation == Size2D(1U, 1U));
std::string kernel_name = "im2col_generic";
if(!run_img2col_reduced)
@@ -110,8 +112,8 @@
build_opts.emplace("#define IM2COL_GENERIC");
_convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
kernel_dims.width, kernel_dims.height,
- conv_info);
- _num_elems_processed_per_iteration = 2;
+ conv_info, dilation);
+ _num_elems_processed_per_iteration = (input->info()->data_type() == DataType::F32) ? 1 : 2;
build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width));
build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.height));
@@ -126,6 +128,8 @@
build_opts.emplace("#define PAD_BOTTOM " + support::cpp11::to_string(conv_info.pad_bottom()));
build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("#define DILATION_X " + support::cpp11::to_string(dilation.x()));
+ build_opts.emplace("#define DILATION_Y " + support::cpp11::to_string(dilation.y()));
_run_func = &GCIm2ColKernel::run_generic;
}
@@ -205,11 +209,12 @@
IGCKernel::configure(win);
}
-Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_UNUSED(kernel_dims);
ARM_COMPUTE_UNUSED(conv_info);
ARM_COMPUTE_UNUSED(has_bias);
+ ARM_COMPUTE_UNUSED(dilation);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
return Status{};
}
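
With dilation in the picture, the convolved dimensions follow the standard dilated-convolution arithmetic. A sketch assuming symmetric padding and FLOOR rounding:

unsigned int convolved_dim_sketch(unsigned int in, unsigned int kernel, unsigned int stride,
                                  unsigned int pad, unsigned int dilation)
{
    const unsigned int effective_kernel = (kernel - 1) * dilation + 1; // dilated footprint
    return (in + 2 * pad - effective_kernel) / stride + 1;
}
// e.g. in = 32, kernel = 3, stride = 1, pad = 0, dilation = 2 -> (32 - 5) / 1 + 1 = 28
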
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index 46d7ff9..f87615a 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -51,7 +51,6 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON(output == input);
ARM_COMPUTE_ERROR_ON(policy != InterpolationPolicy::NEAREST_NEIGHBOR);
- ARM_COMPUTE_UNUSED(sampling_policy);
_input = input;
_output = output;
@@ -123,7 +122,7 @@
output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
output->info()->tensor_shape(),
policy,
- border,
+ sampling_policy,
border_undefined));
IGCKernel::configure(win);
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
index 21946b7..f0057df 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
@@ -36,7 +36,6 @@
#include "support/ToolchainSupport.h"
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
GCTensorShiftKernel::GCTensorShiftKernel()
: _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0)
diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
index 4c08873..ccbfaf8 100644
--- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
@@ -31,11 +31,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
+using namespace arm_compute::misc::shape_calculator;
GCWeightsReshapeKernel::GCWeightsReshapeKernel()
: _input(nullptr), _biases(nullptr), _output(nullptr)
@@ -47,15 +48,8 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- // Calculate output shape
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.collapse(3);
- const size_t tmp_dim = output_shape[0];
- output_shape.set(0, output_shape[1]);
- output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
-
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr))));
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
new file mode 100644
index 0000000..575d858
--- /dev/null
+++ b/src/core/GPUTarget.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Log.h"
+
+#include <map>
+#include <regex>
+
+namespace
+{
+arm_compute::GPUTarget get_bifrost_target(const std::string &version)
+{
+ if(version == "G71")
+ {
+ return arm_compute::GPUTarget::G71;
+ }
+ else if(version == "G72")
+ {
+ return arm_compute::GPUTarget::G72;
+ }
+ else if(version == "G51")
+ {
+ return arm_compute::GPUTarget::G51;
+ }
+ else if(version == "G51BIG")
+ {
+ return arm_compute::GPUTarget::G51BIG;
+ }
+ else if(version == "G51LIT")
+ {
+ return arm_compute::GPUTarget::G51LIT;
+ }
+ else if(version == "TNOX")
+ {
+ return arm_compute::GPUTarget::TNOX;
+ }
+ else if(version == "TTRX")
+ {
+ return arm_compute::GPUTarget::TTRX;
+ }
+ else if(version == "TBOX")
+ {
+ return arm_compute::GPUTarget::TBOX;
+ }
+ else
+ {
+ return arm_compute::GPUTarget::BIFROST;
+ }
+}
+
+arm_compute::GPUTarget get_midgard_target(const std::string &version)
+{
+ if(version == "T600")
+ {
+ return arm_compute::GPUTarget::T600;
+ }
+ else if(version == "T700")
+ {
+ return arm_compute::GPUTarget::T700;
+ }
+ else if(version == "T800")
+ {
+ return arm_compute::GPUTarget::T800;
+ }
+ else
+ {
+ return arm_compute::GPUTarget::MIDGARD;
+ }
+}
+} // namespace
+
+namespace arm_compute
+{
+const std::string &string_from_target(GPUTarget target)
+{
+ static std::map<GPUTarget, const std::string> gpu_target_map =
+ {
+ { GPUTarget::MIDGARD, "midgard" },
+ { GPUTarget::BIFROST, "bifrost" },
+ { GPUTarget::T600, "t600" },
+ { GPUTarget::T700, "t700" },
+ { GPUTarget::T800, "t800" },
+ { GPUTarget::G71, "g71" },
+ { GPUTarget::G72, "g72" },
+ { GPUTarget::G51, "g51" },
+ { GPUTarget::G51BIG, "g51big" },
+ { GPUTarget::G51LIT, "g51lit" },
+ { GPUTarget::TNOX, "tnox" },
+ { GPUTarget::TTRX, "ttrx" },
+ { GPUTarget::TBOX, "tbox" }
+ };
+
+ return gpu_target_map[target];
+}
+
+GPUTarget get_target_from_name(const std::string &device_name)
+{
+ std::regex mali_regex(R"(Mali-(.*))");
+ std::smatch name_parts;
+ const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
+
+ if(!found_mali)
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Mali GPU. Target is set to UNKNOWN.");
+ return GPUTarget::UNKNOWN;
+ }
+
+ const char target = name_parts.str(1)[0];
+ const std::string &version = name_parts.str(1);
+
+ std::regex future_regex(R"(.*X)");
+ const bool is_future_bifrost = std::regex_search(version, future_regex);
+
+ if(target == 'G' || is_future_bifrost)
+ {
+ return get_bifrost_target(version);
+ }
+ else if(target == 'T')
+ {
+ return get_midgard_target(version);
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("Mali GPU unknown. Target is set to the default one. (BIFROST)");
+ return GPUTarget::BIFROST;
+ }
+}
+
+GPUTarget get_arch_from_target(GPUTarget target)
+{
+ return (target & GPUTarget::GPU_ARCH_MASK);
+}
+} // namespace arm_compute
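
A usage sketch for the new name-based lookup; the device-name strings below are illustrative:

#include "arm_compute/core/GPUTarget.h"

void gpu_target_lookup_examples()
{
    using arm_compute::GPUTarget;
    const GPUTarget g72  = arm_compute::get_target_from_name("Mali-G72");   // 'G' prefix       -> GPUTarget::G72
    const GPUTarget tnox = arm_compute::get_target_from_name("Mali-TNOX");  // ".*X" rule       -> GPUTarget::TNOX
    const GPUTarget unk  = arm_compute::get_target_from_name("Adreno 540"); // no "Mali-" match -> GPUTarget::UNKNOWN
    (void)g72; (void)tnox; (void)unk;
}
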
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
index 73f4c42..4f99455 100644
--- a/src/core/HOGInfo.cpp
+++ b/src/core/HOGInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@
_phase_type = phase_type;
// Compute descriptor size. The +1 accounts for the bias
- _descriptor_size = num_cells_per_block().area() * num_blocks_per_image(_detection_window_size).area() * _num_bins + 1;
+ _descriptor_size = num_cells_per_block().area() * num_block_positions_per_image(_detection_window_size).area() * _num_bins + 1;
}
Size2D HOGInfo::num_cells_per_block() const
@@ -80,8 +80,10 @@
_block_stride.height / _cell_size.height);
}
-Size2D HOGInfo::num_blocks_per_image(const Size2D &image_size) const
+Size2D HOGInfo::num_block_positions_per_image(const Size2D &image_size) const
{
+ ARM_COMPUTE_ERROR_ON(_block_stride.width == 0 || _block_stride.height == 0);
+
return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
((image_size.height - _block_size.height) / _block_stride.height) + 1);
}
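
After the rename, the descriptor-size line in configure() reads directly as cells-per-block x block-positions x bins, plus one for the bias. A worked example with the classic 64x128 pedestrian-detection configuration (illustrative numbers, not library defaults):

constexpr unsigned int win_w = 64, win_h = 128;                    // detection window
constexpr unsigned int block = 16, stride = 8, cell = 8, bins = 9;
constexpr unsigned int pos_x = (win_w - block) / stride + 1;       // 7 block positions
constexpr unsigned int pos_y = (win_h - block) / stride + 1;       // 15 block positions
constexpr unsigned int cells_per_block = (block / cell) * (block / cell);            // 4
constexpr unsigned int descriptor_size = cells_per_block * pos_x * pos_y * bins + 1; // +1 for the bias
static_assert(descriptor_size == 3781, "4 * 105 * 9 + 1");
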
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 3ee0fa7..e336331 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -173,3 +173,79 @@
return window;
}
+
+ValidRegion arm_compute::calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+{
+ const DataLayout data_layout = src_info.data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const float scale_x = static_cast<float>(dst_shape[idx_width]) / src_info.tensor_shape()[idx_width];
+ const float scale_y = static_cast<float>(dst_shape[idx_height]) / src_info.tensor_shape()[idx_height];
+ const float sampling_point = (sampling_policy == SamplingPolicy::CENTER) ? 0.5f : 0.0f;
+
+ // Get input's valid region start and end points
+ const int valid_start_in_x = src_info.valid_region().anchor[idx_width];
+ const int valid_start_in_y = src_info.valid_region().anchor[idx_height];
+ const int valid_end_in_x = src_info.valid_region().anchor[idx_width] + src_info.valid_region().shape[idx_width];
+ const int valid_end_in_y = src_info.valid_region().anchor[idx_height] + src_info.valid_region().shape[idx_height];
+
+ // Initialize output's valid region start and end points
+ auto valid_start_out_x = static_cast<int>(valid_start_in_x * scale_x);
+ auto valid_start_out_y = static_cast<int>(valid_start_in_y * scale_y);
+ auto valid_end_out_x = std::min<int>(std::ceil(valid_end_in_x * scale_x), dst_shape[idx_width]);
+ auto valid_end_out_y = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]);
+
+ // Adjust the valid points according to the interpolation policy when the border is undefined
+ if(border_undefined)
+ {
+ switch(interpolate_policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ // (start_out + sampling_point) >= (start_in * scale)
+ // start_out = ceil((start_in * scale) - sampling_point)
+ valid_start_out_x = std::ceil(valid_start_in_x * scale_x - sampling_point);
+ valid_start_out_y = std::ceil(valid_start_in_y * scale_y - sampling_point);
+
+ // (end_out - 1 + sampling_point) < (end_in * scale)
+ // end_out = ceil((end_in * scale) - sampling_point); // <-- ceil(x - 1) strictly less
+ valid_end_out_x = std::ceil(valid_end_in_x * scale_x - sampling_point);
+ valid_end_out_y = std::ceil(valid_end_in_y * scale_y - sampling_point);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ // (start_out + sampling_point) >= ((start_in + sampling_point) * scale)
+ // start_out = ceil(((start_in + sampling_point) * scale) - sampling_point)
+ valid_start_out_x = std::ceil((valid_start_in_x + sampling_point) * scale_x - sampling_point);
+ valid_start_out_y = std::ceil((valid_start_in_y + sampling_point) * scale_y - sampling_point);
+
+ // (end_out - 1 + sampling_point) <= ((end_in - 1 + sampling_point) * scale)
+ // end_out = floor(((end_in - 1 + sampling_point) * scale) - sampling_point + 1)
+ valid_end_out_x = std::floor((valid_end_in_x - 1.f + sampling_point) * scale_x - sampling_point + 1.f);
+ valid_end_out_y = std::floor((valid_end_in_y - 1.f + sampling_point) * scale_y - sampling_point + 1.f);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ break;
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid InterpolationPolicy");
+ break;
+ }
+ }
+ }
+
+ // Setup output valid region
+ ValidRegion valid_region{ Coordinates(), dst_shape, src_info.tensor_shape().num_dimensions() };
+
+ valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x));
+ valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y));
+
+ valid_region.shape.set(idx_width, std::min<size_t>(valid_end_out_x - valid_start_out_x, dst_shape[idx_width]));
+ valid_region.shape.set(idx_height, std::min<size_t>(valid_end_out_y - valid_start_out_y, dst_shape[idx_height]));
+
+ return valid_region;
+}
\ No newline at end of file
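
A numeric sketch of the BILINEAR branch with CENTER sampling (sampling_point = 0.5) and undefined borders, for a 2x upscale of a fully-valid 10-pixel row:

#include <cassert>
#include <cmath>

void valid_region_scale_example()
{
    const float scale = 2.0f, sampling_point = 0.5f; // CENTER sampling
    const int valid_start_in = 0, valid_end_in = 10; // fully-valid 10-pixel row
    const int start_out = static_cast<int>(std::ceil((valid_start_in + sampling_point) * scale - sampling_point));
    const int end_out   = static_cast<int>(std::floor((valid_end_in - 1.f + sampling_point) * scale - sampling_point + 1.f));
    assert(start_out == 1 && end_out == 19); // 18 of the 20 output pixels stay valid
}
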
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index b65c4f4..eb5f072 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -151,3 +151,13 @@
}
}
}
+
+bool ITensor::is_used() const
+{
+ return _is_used;
+}
+
+void ITensor::mark_as_unused() const
+{
+ _is_used = false;
+}
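
is_used() and mark_as_unused() give memory-manager-aware runtimes a hook to flag tensors whose contents are no longer needed. A sketch of the intended call pattern; the real call sites live in the runtime functions, which this diff does not show:

#include "arm_compute/core/ITensor.h"

void release_temporary(const arm_compute::ITensor &scratch)
{
    // After the last consumer of `scratch` has run, flag it so a memory
    // manager can recycle its backing allocation.
    if(scratch.is_used())
    {
        scratch.mark_as_unused(); // const member: it only mutates the usage flag
    }
}
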
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 1f730a2..6be50fd 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -58,20 +58,39 @@
if(nullptr != output)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ if(beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ }
+ if(gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
return Status{};
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+ }
+
unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -99,13 +118,13 @@
const int fixed_point_position = _input->info()->fixed_point_position();
const auto input_mean = reinterpret_cast<const qint8_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const qint8_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
- const auto input_beta = reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
qint8x16_t mean_vec = vdupq_n_qs8(0);
qint8x16_t var_vec = vdupq_n_qs8(0);
- qint8x16_t gamma_vec = vdupq_n_qs8(0);
- qint8x16_t beta_vec = vdupq_n_qs8(0);
+ qint8x16_t gamma_vec = vdupq_n_qs8(sqcvt_qs8_f32(1, fixed_point_position));
+ qint8x16_t beta_vec = vdupq_n_qs8(sqcvt_qs8_f32(0, fixed_point_position));
qint8x16_t denominator = vdupq_n_qs8(0);
const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
@@ -113,10 +132,16 @@
if(slice != id.z())
{
// Construct vectors
- mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
- var_vec = vdupq_n_qs8(*(input_var + id.z()));
- gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
- beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
+ mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs8(*(input_var + id.z()));
+ if(input_gamma != nullptr)
+ {
+ gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
+ }
+ if(input_beta != nullptr)
+ {
+ beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
+ }
// Calculate denominator
denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
@@ -146,13 +171,13 @@
const int fixed_point_position = _input->info()->fixed_point_position();
const auto input_mean = reinterpret_cast<const qint16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const qint16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
- const auto input_beta = reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
qint16x8_t mean_vec = vdupq_n_qs16(0);
qint16x8_t var_vec = vdupq_n_qs16(0);
- qint16x8_t gamma_vec = vdupq_n_qs16(0);
- qint16x8_t beta_vec = vdupq_n_qs16(0);
+ qint16x8_t gamma_vec = vdupq_n_qs16(sqcvt_qs16_f32(1, fixed_point_position));
+ qint16x8_t beta_vec = vdupq_n_qs16(sqcvt_qs16_f32(0, fixed_point_position));
qint16x8_t denominator = vdupq_n_qs16(0);
const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
@@ -160,10 +185,16 @@
if(slice != id.z())
{
// Construct vectors
- mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
- var_vec = vdupq_n_qs16(*(input_var + id.z()));
- gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
- beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
+ mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs16(*(input_var + id.z()));
+ if(input_gamma != nullptr)
+ {
+ gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
+ }
+ if(input_beta != nullptr)
+ {
+ beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
+ }
// Calculate denominator
denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
@@ -179,9 +210,9 @@
}
template <bool fused_activation>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16(const Window &window)
+void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
{
- static_assert(!fused_activation, "Activation is not supported for QS8");
+ static_assert(!fused_activation, "Activation is not supported for FP16");
ARM_COMPUTE_UNUSED(window);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -194,12 +225,12 @@
const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
- const auto input_beta = reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
float16x8_t mean_vec = vdupq_n_f16(0.0);
float16x8_t var_vec = vdupq_n_f16(0.0);
- float16x8_t gamma_vec = vdupq_n_f16(0.0);
+ float16x8_t gamma_vec = vdupq_n_f16(1.0);
float16x8_t beta_vec = vdupq_n_f16(0.0);
float16x8_t denominator = vdupq_n_f16(0.0);
const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
@@ -208,10 +239,16 @@
if(slice != id.z())
{
// Construct vectors
- mean_vec = vdupq_n_f16(*(input_mean + id.z()));
- var_vec = vdupq_n_f16(*(input_var + id.z()));
- gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
- beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+ mean_vec = vdupq_n_f16(*(input_mean + id.z()));
+ var_vec = vdupq_n_f16(*(input_var + id.z()));
+ if(input_gamma != nullptr)
+ {
+ gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+ }
+ if(input_beta != nullptr)
+ {
+ beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+ }
// Calculate denominator
denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
@@ -227,8 +264,43 @@
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
+template <bool fused_activation>
+void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
+{
+ static_assert(!fused_activation, "Activation is not supported for FP16");
+
+ ARM_COMPUTE_UNUSED(window);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+ const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Construct vectors
+ const float16x8_t mean_vec = vld1q_f16(input_mean + id.x());
+ const float16x8_t var_vec = vld1q_f16(input_var + id.x());
+ const float16x8_t gamma_vec = (input_gamma != nullptr) ? vld1q_f16(input_gamma + id.x()) : vdupq_n_f16(1.0);
+ const float16x8_t beta_vec = (input_beta != nullptr) ? vld1q_f16(input_beta + id.x()) : vdupq_n_f16(0.0);
+ // Calculate denominator
+ const float16x8_t denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
+
+ // Calculate x bar and store results
+ const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
+ const float16x8_t x_bar = vmulq_f16(numerator, denominator);
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+ },
+ input, output);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+
template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32(const Window &window)
+void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw(const Window &window)
{
Iterator input(_input, window);
Iterator output(_output, window);
@@ -241,12 +313,12 @@
const auto input_mean = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0)));
- const auto input_beta = reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
float32x4_t mean_vec = vdupq_n_f32(0.0);
float32x4_t var_vec = vdupq_n_f32(0.0);
- float32x4_t gamma_vec = vdupq_n_f32(0.0);
+ float32x4_t gamma_vec = vdupq_n_f32(1.0);
float32x4_t beta_vec = vdupq_n_f32(0.0);
float32x4_t denominator = vdupq_n_f32(0.0);
const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
@@ -255,10 +327,16 @@
if(slice != id.z())
{
// Construct vectors
- mean_vec = vdupq_n_f32(*(input_mean + id.z()));
- var_vec = vdupq_n_f32(*(input_var + id.z()));
- gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
- beta_vec = vdupq_n_f32(*(input_beta + id.z()));
+ mean_vec = vdupq_n_f32(*(input_mean + id.z()));
+ var_vec = vdupq_n_f32(*(input_var + id.z()));
+ if(input_gamma != nullptr)
+ {
+ gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+ }
+ if(input_beta != nullptr)
+ {
+ beta_vec = vdupq_n_f32(*(input_beta + id.z()));
+ }
// Calculate denominator
denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
@@ -282,8 +360,50 @@
input, output);
}
+template <bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ F activation_functor(_act_info);
+
+ const auto input_mean = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+ const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Construct vectors
+ const float32x4_t mean_vec = vld1q_f32(input_mean + id.x());
+ const float32x4_t var_vec = vld1q_f32(input_var + id.x());
+ const float32x4_t gamma_vec = (input_gamma != nullptr) ? vld1q_f32(input_gamma + id.x()) : vdupq_n_f32(1.0);
+ const float32x4_t beta_vec = (input_beta != nullptr) ? vld1q_f32(input_beta + id.x()) : vdupq_n_f32(0.0);
+ // Calculate denominator
+ const float32x4_t denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+ const float32x4_t x_bar = vmulq_f32(numerator, denominator);
+ float32x4_t res = vmlaq_f32(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+ },
+ input, output);
+}
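
The NHWC variants differ from the NCHW ones mainly in how the per-channel parameters are fetched: NCHW broadcasts one scalar per z-slice, while NHWC keeps channels contiguous along x and can vector-load four parameters per iteration. Side by side (sketch, assumes arm_neon.h and a float buffer of per-channel parameters):

#include <arm_neon.h>

float32x4_t load_param_nchw(const float *param, int z) // one channel per z-slice
{
    return vdupq_n_f32(param[z]); // broadcast the single per-channel value
}

float32x4_t load_param_nhwc(const float *param, int x) // channels contiguous along x
{
    return vld1q_f32(param + x);  // four consecutive channels in one load
}
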
+
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
+ const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
switch(_input->info()->data_type())
{
case DataType::QS8:
@@ -293,10 +413,11 @@
_func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16<false>;
break;
case DataType::F16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp16<false>;
+ _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
break;
case DataType::F32:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp32<false, ::detail::dummy<float, 4>>;
+ _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
+ &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
break;
default:
ARM_COMPUTE_ERROR("Element size not supported");
@@ -306,18 +427,25 @@
void NEBatchNormalizationLayerKernel::configure_fused()
{
- // Fused Batched Normalization with activation functions : FP32
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32 =
+ // NCHW fused batch normalization with activation functions: FP32
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
{
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::lubrelu<float, 4>> }
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::relu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::brelu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::lubrelu<float, 4>> }
+ };
+ // NHWC fused batch normalization with activation functions: FP32
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
+ {
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::relu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
};
switch(_input->info()->data_type())
{
case DataType::F32:
- _func = bn_fused_map_f32[_act_info.activation()];
+ _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f32_nhwc[_act_info.activation()] : bn_fused_map_f32_nchw[_act_info.activation()];
break;
default:
ARM_COMPUTE_ERROR("Element size not supported");
@@ -335,21 +463,12 @@
const ITensor *beta, const ITensor *gamma,
float epsilon, ActivationLayerInfo act_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
- ITensorInfo *output_info = nullptr;
-
- if(nullptr != output)
- {
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info());
-
- output_info = output->info();
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info,
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
mean->info(), var->info(),
- beta->info(), gamma->info(),
+ (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr,
epsilon, act_info));
_input = input;
@@ -361,7 +480,8 @@
_epsilon = epsilon;
_act_info = act_info;
- if(output != nullptr)
+ const bool run_in_place = (output == nullptr) || (output == input);
+ if(!run_in_place)
{
_output = output;
}
@@ -377,7 +497,7 @@
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output_info);
+ auto win_config = validate_and_configure_window(input->info(), (run_in_place) ? nullptr : output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
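For reference, the per-element computation that the NEON paths above vectorise is the standard batch normalization formula, with gamma defaulting to 1 and beta to 0 when the respective tensor is absent. A minimal scalar sketch (the helper name is illustrative, not library API):

#include <cmath>

// One element: x_bar = (x - mean) / sqrt(var + epsilon), result = gamma * x_bar + beta.
// Null gamma/beta fall back to 1 and 0, matching the vdupq_n_f32 defaults above.
inline float batch_normalize(float x, float mean, float var, float epsilon,
                             const float *gamma, const float *beta)
{
    const float x_bar = (x - mean) / std::sqrt(var + epsilon);
    const float g     = (gamma != nullptr) ? *gamma : 1.0f;
    const float b     = (beta != nullptr) ? *beta : 0.0f;
    return g * x_bar + b;
}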
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index a2b24de..28fb4bd 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,47 +56,58 @@
ARM_COMPUTE_ERROR_ON(plane1 == output);
ARM_COMPUTE_ERROR_ON(plane2 == output);
- set_format_if_unknown(*plane0->info(), Format::U8);
- set_format_if_unknown(*plane1->info(), Format::U8);
- set_format_if_unknown(*plane2->info(), Format::U8);
-
- if(plane3 != nullptr)
- {
- set_format_if_unknown(*plane3->info(), Format::U8);
- }
-
- set_shape_if_empty(*output->info(), plane0->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
- if(plane3 != nullptr)
+ const Format output_format = output->info()->format();
+
+ // Check that the horizontal dimension of the Y plane is even and validate the horizontal sub-sampling of the U and V planes
+ if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3);
+ // Validate Y plane of input and output
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
+
+ // Validate U and V planes of the input
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
}
- const Format &output_format = output->info()->format();
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
- if(output_format == Format::RGBA8888)
+ // Validate the last input tensor only for RGBA format
+ if(Format::RGBA8888 == output_format)
{
- ARM_COMPUTE_ERROR_ON(plane3 == output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
+
+ _planes[3] = plane3;
}
- _planes[0] = plane0;
- _planes[1] = plane1;
- _planes[2] = plane2;
- _planes[3] = plane3;
_output = output;
_output_multi = nullptr;
+ // Halve the number of elements processed for the U and V channels due to the horizontal sub-sampling factor of 2
+ if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
+ {
+ _x_subsampling[1] = 2;
+ _x_subsampling[2] = 2;
+ }
+
_num_elems_processed_per_iteration = 8;
_is_parallelizable = true;
+ // Select function and number of elements to process given the output format
switch(output_format)
{
case Format::RGB888:
@@ -106,14 +117,10 @@
_func = &NEChannelCombineKernel::combine_4C;
break;
case Format::UYVY422:
- _x_subsampling[1] = 2;
- _x_subsampling[2] = 2;
_num_elems_processed_per_iteration = 16;
_func = &NEChannelCombineKernel::combine_YUV_1p<true>;
break;
case Format::YUYV422:
- _x_subsampling[1] = 2;
- _x_subsampling[2] = 2;
_num_elems_processed_per_iteration = 16;
_func = &NEChannelCombineKernel::combine_YUV_1p<false>;
break;
@@ -122,14 +129,6 @@
break;
}
- TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
- subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
- TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
- subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
-
Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
@@ -167,65 +166,52 @@
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
- set_format_if_unknown(*plane0->info(), Format::U8);
- set_format_if_unknown(*plane1->info(), Format::U8);
- set_format_if_unknown(*plane2->info(), Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
- set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape());
-
- switch(output->info()->format())
- {
- case Format::NV12:
- case Format::NV21:
- case Format::IYUV:
- {
- TensorShape subsampled_shape = plane0->info()->tensor_shape();
- subsampled_shape.set(0, subsampled_shape[0] / 2);
- subsampled_shape.set(1, subsampled_shape[1] / 2);
-
- set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape);
-
- if(output->info()->format() == Format::IYUV)
- {
- set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape);
- }
- break;
- }
- case Format::YUV444:
- set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape());
- set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported format");
- }
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0));
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
- _planes[0] = plane0;
- _planes[1] = plane1;
- _planes[2] = plane2;
- _planes[3] = nullptr;
- _output = nullptr;
- _output_multi = output;
+ const Format output_format = output->info()->format();
+
+ // Validate that the Y plane shape is even and that the U and V planes have the expected sub-sampled shapes.
+ // Perform this validation only for formats which require sub-sampling.
+ if(Format::YUV444 != output_format)
+ {
+ // Validate Y plane of input and output
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
+
+ // Validate U and V planes of the input
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
+
+ // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
+ // MultiImage generates the correct tensor shape, but check anyway in case the plane tensor shapes were changed to an incorrect size
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
+
+ // Validate the last plane V of format IYUV
+ if(Format::IYUV == output_format)
+ {
+ // Validate V plane of the output
+ ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
+ }
+ }
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+ _output = nullptr;
+ _output_multi = output;
+
bool has_two_planes = false;
unsigned int num_elems_written_plane1 = 8;
_num_elems_processed_per_iteration = 8;
_is_parallelizable = true;
- const Format &output_format = output->info()->format();
-
switch(output_format)
{
case Format::NV12:
@@ -268,8 +254,7 @@
output_plane1_access,
output_plane2_access);
- ValidRegion plane0_valid_region = plane0->info()->valid_region();
-
+ ValidRegion plane0_valid_region = plane0->info()->valid_region();
ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
@@ -358,7 +343,7 @@
{
// Create sub-sampled uv window and init uv planes
Window win_uv(win);
- win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]);
+ win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
win_uv.validate();
Iterator p0(_planes[0], win);
@@ -405,13 +390,13 @@
// Update UV window
Window uv_win(win);
- uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration));
+ uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
uv_win.validate();
// Update output win
Window out_win(win);
- out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2));
+ out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
out_win.validate();
@@ -421,6 +406,9 @@
Iterator p2(_planes[2 - shift], uv_win);
Iterator out(_output_multi->plane(1), out_win);
+ // Increase the step size after the iterator is created so that the stride is calculated correctly for the multi-channel format
+ out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
+
execute_window_loop(out_win, [&](const Coordinates & id)
{
const uint8x8x2_t pixels =
@@ -450,19 +438,17 @@
// Update window
Window tmp_win(win);
- tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration));
+ tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
- tmp_win.validate();
Iterator in(_planes[plane_id], tmp_win);
Iterator out(_output_multi->plane(plane_id), tmp_win);
execute_window_loop(tmp_win, [&](const Coordinates & id)
{
- const auto in_ptr = static_cast<uint8_t *>(in.ptr());
- const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const uint8x8_t pixels = vld1_u8(in.ptr());
- vst1_u8(out_ptr, vld1_u8(in_ptr));
+ vst1_u8(out.ptr(), pixels);
},
in, out);
}
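The window arithmetic above follows one rule: sub-sampling a dimension by a factor scales its start, end and step together. A small sketch under that assumption (the Dim struct and helper are hypothetical, not library API):

struct Dim
{
    int start, end, step;
};

// Scale start, end and step by the same sub-sampling factor, as the corrected
// UV window setup above does.
inline Dim subsample(const Dim &d, int factor)
{
    return Dim{ d.start / factor, d.end / factor, d.step / factor };
}

// e.g. for the 4:2:2 formats (_x_subsampling[1] == 2), a Y window of [0, 16)
// with step 16 maps to a UV window of [0, 8) with step 8.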
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000..b3746bd
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+NEConvertFullyConnectedWeightsKernel::NEConvertFullyConnectedWeightsKernel()
+ : _input(nullptr), _output(nullptr), _factor1(0), _factor2(0)
+{
+}
+
+void NEConvertFullyConnectedWeightsKernel::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
+
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
+ const unsigned int num_channels = original_input_shape.z();
+
+ // Set the permutation factors based on the data layout
+ if(data_layout == DataLayout::NCHW)
+ {
+ _factor1 = num_elems_per_input_plane;
+ _factor2 = num_channels;
+ }
+ else
+ {
+ _factor1 = num_channels;
+ _factor2 = num_elems_per_input_plane;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ INEKernel::configure(win);
+}
+
+Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+ DataType::QS32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+ return Status{};
+}
+
+template <typename T>
+void NEConvertFullyConnectedWeightsKernel::run_convert_fc_weights(const Window &window)
+{
+ const unsigned int dst_stride_x = _output->info()->strides_in_bytes().x();
+ const unsigned int dst_stride_y = _output->info()->strides_in_bytes().y();
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ *reinterpret_cast<T *>(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y) = *reinterpret_cast<T *>(input.ptr());
+ },
+ input);
+}
+
+void NEConvertFullyConnectedWeightsKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->element_size())
+ {
+ case 1:
+ run_convert_fc_weights<uint8_t>(window);
+ break;
+ case 2:
+ run_convert_fc_weights<uint16_t>(window);
+ break;
+ case 4:
+ run_convert_fc_weights<uint32_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
+}
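The kernel above boils down to a row permutation on the 2D weight matrix: source row y is written to destination row (y % factor1) * factor2 + y / factor1, with the factors taken from the original input shape as configured earlier. A short sketch (helper name illustrative):

// Destination row for source row y, as computed in run_convert_fc_weights().
inline unsigned int permute_row(unsigned int y, unsigned int factor1, unsigned int factor2)
{
    return (y % factor1) * factor2 + y / factor1;
}

// e.g. an original input shape of 2x2x3 gives factor1 = 4, factor2 = 3 for
// NCHW, so rows 0..11 map to 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11.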
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index f5ee608..8cdf175 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -52,13 +52,14 @@
{
public:
static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
const int input_offset = -input->info()->quantization_info().offset;
const int weights_offset = -weights->info()->quantization_info().offset;
const int input_stride_x = input->info()->strides_in_bytes().x();
const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
const int output_stride_y = output->info()->strides_in_bytes().y();
const int kernel_stride_y = weights->info()->strides_in_bytes().y();
const int kernel_stride_z = weights->info()->strides_in_bytes().z();
@@ -93,7 +94,7 @@
int ih = 0;
int oh = 0;
- const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y - (id.z() - id.z() / depth_multiplier) * input_stride_z;
const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
@@ -125,19 +126,19 @@
template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
switch(conv_stride_x)
{
case 1:
- convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
break;
case 2:
- convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
break;
case 3:
- convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
@@ -146,7 +147,7 @@
} // namespace
NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false)
+ : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false), _depth_multiplier(1)
{
}
@@ -155,20 +156,22 @@
return _border_size;
}
-void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, DataLayout data_layout)
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- _input = input;
- _output = output;
- _weights = weights;
- _conv_info = conv_info;
- _convolver = nullptr;
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _conv_info = conv_info;
+ _depth_multiplier = depth_multiplier;
+ _convolver = nullptr;
_run_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
conv_info,
- input->info()->data_type(),
+ input->info()->data_type(), depth_multiplier,
data_layout);
(_run_optimized) ? configure_optimized() : configure_generic();
@@ -182,7 +185,7 @@
(_run_optimized) ? run_optimized(window, info) : run_generic(window, info);
}
-bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, DataLayout data_layout)
+bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier, DataLayout data_layout)
{
// Reshape input shape if in NHWC format
TensorShape in_shape{ input_shape };
@@ -210,7 +213,7 @@
bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
bool supported_padding = is_same_padding || is_valid_padding;
- return supported_datatype && supported_strides && supported_padding;
+ return supported_datatype && supported_strides && supported_padding && (depth_multiplier == 1);
}
void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
@@ -219,8 +222,7 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
- _convolver = create_convolver_object(_input->info()->tensor_shape(), _conv_info,
- _weights->buffer(), _input->buffer(), _output->buffer());
+ _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
}
void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
@@ -228,7 +230,7 @@
ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(0) != 3 || _weights->info()->dimension(1) != 3);
// Get convolved dimensions
- const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info);
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info, _depth_multiplier);
const DataType output_dt = (_input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : _input->info()->data_type();
// Output auto initialization if not yet initialized
@@ -282,8 +284,7 @@
ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
_border_size = BorderSize(0, 0);
- _convolver = create_convolver_object(_input->info()->tensor_shape(), _conv_info,
- _weights->buffer(), _input->buffer(), _output->buffer());
+ _convolver = create_convolver_object(_conv_info, _weights, _input, _output);
// Auto-configure output
bool same_padding = _conv_info.has_padding();
@@ -296,6 +297,15 @@
auto_init_if_empty(*_output->info(),
_input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ // Set padding in channels
+ const int num_channels = _weights->info()->dimension(0);
+ if((num_channels >= 128) && (num_channels % 16 == 0))
+ {
+ _input->info()->extend_padding(PaddingSize(0, 4, 0, 0));
+ _weights->info()->extend_padding(PaddingSize(0, 4, 0, 0));
+ _output->info()->extend_padding(PaddingSize(0, 4, 0, 0));
+ }
+
// Configure window
Window win;
auto win_last = _convolver->get_window();
@@ -310,10 +320,10 @@
switch(_input->info()->data_type())
{
case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
break;
case DataType::QASYMM8:
- convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
@@ -330,41 +340,56 @@
_convolver->run(start, end);
}
-std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(TensorShape shape,
- PadStrideInfo conv_info,
- const uint8_t *w_ptr,
- uint8_t *in_ptr,
- uint8_t *out_ptr)
+std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(PadStrideInfo conv_info,
+ const ITensor *w,
+ const ITensor *in,
+ ITensor *out,
+ bool setup_strides)
{
- const int in_rows = shape.z();
- const int in_cols = shape.y();
- const int n_batches = shape[3];
- const int n_channels = shape.x();
- const bool padding_same = conv_info.has_padding();
+ const TensorShape shape = in->info()->tensor_shape();
+ const int in_rows = shape.z();
+ const int in_cols = shape.y();
+ const int n_batches = shape[3];
+ const int n_channels = shape.x();
+ const bool padding_same = conv_info.has_padding();
+ const int weight_col_stride = (setup_strides) ? w->info()->strides_in_bytes().y() / w->info()->element_size() : 0;
+ const int weight_row_stride = (setup_strides) ? w->info()->strides_in_bytes().z() / w->info()->element_size() : 0;
+ const int input_col_stride = (setup_strides) ? in->info()->strides_in_bytes().y() / in->info()->element_size() : 0;
+ const int input_row_stride = (setup_strides) ? in->info()->strides_in_bytes().z() / in->info()->element_size() : 0;
+ const int input_batch_stride = (setup_strides) ? in->info()->strides_in_bytes()[3] / in->info()->element_size() : 0;
+ const int output_col_stride = (setup_strides) ? out->info()->strides_in_bytes().y() / out->info()->element_size() : 0;
+ const int output_row_stride = (setup_strides) ? out->info()->strides_in_bytes().z() / out->info()->element_size() : 0;
+ const int output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
const auto stride_x = conv_info.stride().first;
switch(stride_x)
{
case 1:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>>(
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
n_batches,
in_rows,
in_cols,
n_channels,
padding_same,
- reinterpret_cast<const float *>(w_ptr),
- reinterpret_cast<float *>(in_ptr),
- reinterpret_cast<float *>(out_ptr));
+ reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
+ weight_col_stride, weight_row_stride,
+ input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
case 2:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>>(
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
n_batches,
in_rows,
in_cols,
n_channels,
padding_same,
- reinterpret_cast<const float *>(w_ptr),
- reinterpret_cast<float *>(in_ptr),
- reinterpret_cast<float *>(out_ptr));
+ reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
+ weight_col_stride, weight_row_stride,
+ input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
default:
return nullptr;
}
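The depth-multiplier support above rests on a single mapping: output channel z is computed from input channel z / depth_multiplier, which is why the input pointer is rewound by (id.z() - id.z() / depth_multiplier) * input_stride_z. A sketch of the mapping (helper name illustrative):

// Input channel feeding a given output channel under a depth multiplier.
inline unsigned int input_channel(unsigned int output_channel, unsigned int depth_multiplier)
{
    return output_channel / depth_multiplier;
}

// e.g. depth_multiplier == 2 maps output channels {0, 1, 2, 3} to input
// channels {0, 0, 1, 1}.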
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index b924d9f..cfd8eac 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -85,7 +85,7 @@
const int src_y = -pad_top + src_pixel_linear / max_initial_x * stride_y;
// Get pointers
- const uint8_t *const input_ptr = in.ptr() + id.z() * input_stride_z;
+ const uint8_t *const input_ptr = in.ptr() + id.z() / _depth_multiplier * input_stride_z;
auto output_ptr = reinterpret_cast<T *>(out.ptr());
const int height = src_y + _kernel_dims.height;
const int width = src_x + _kernel_dims.width;
@@ -114,24 +114,25 @@
}
NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1)
{
}
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
- _input = input;
- _output = output;
- _kernel_dims = kernel_dims;
- _conv_info = conv_info;
- _has_bias = has_bias;
+ _input = input;
+ _output = output;
+ _kernel_dims = kernel_dims;
+ _conv_info = conv_info;
+ _has_bias = has_bias;
+ _depth_multiplier = depth_multiplier;
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
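The reworked checks above encode the expected im2col geometry: each output row holds one flattened kernel patch plus an optional bias slot, and the output channel count equals the input channel count times the depth multiplier. A sketch of the row-length rule (helper name illustrative):

// Expected length of one im2col output row, as asserted in configure().
inline unsigned int im2col_row_length(unsigned int kernel_w, unsigned int kernel_h, bool has_bias)
{
    return kernel_w * kernel_h + (has_bias ? 1u : 0u);
}

// e.g. a 3x3 kernel with bias gives rows of length 10.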
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index be211b2..4120e5f 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+ // Update window and padding
+ bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
NEDequantizationLayerKernel::NEDequantizationLayerKernel()
: _input(nullptr), _output(nullptr), _min_max(nullptr)
{
@@ -41,34 +81,27 @@
void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_NULLPTR(min_max);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
_input = input;
_output = output;
_min_max = min_max;
- constexpr unsigned int num_elems_processed_per_iteration = 8;
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- // Update window and padding
- update_window_and_padding(win, input_access, output_access, min_max_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+ INEKernel::configure(std::get<1>(win_config));
+}
- INEKernel::configure(win);
+Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+ return Status{};
}
void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
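For context, this kernel maps each U8 value into the [min, max] range read from the min_max tensor. A hedged scalar sketch, assuming the usual affine min/max mapping (helper name illustrative, not library API):

#include <cstdint>

// Affine dequantization of a U8 value into [min, max]: 0 maps to min, 255 to max.
inline float dequantize(uint8_t value, float min, float max)
{
    return (static_cast<float>(value) * (max - min)) / 255.0f + min;
}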
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 285ec2d..5eafdf0 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -33,6 +33,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <algorithm>
#include <arm_neon.h>
@@ -663,6 +664,118 @@
vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
}
+template <typename T1>
+class convolver_nhwc
+{
+public:
+ static void convolve(const Window &window, int kernel_size, unsigned int num_elems_read_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ const int input_width = input->info()->dimension(0);
+ const int input_depth = input->info()->dimension(2);
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_x = output->info()->strides_in_bytes().x();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int conv_pad_top = conv_info.pad_top();
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const T1 zero = 0;
+
+ // Setup input window for the input iterator
+ Window window_in = window;
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Setup window for the weights iterator
+ Window window_k = calculate_max_window(*weights->info(), Steps());
+ window_k.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_k.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_k.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1));
+
+ Iterator in(input, window_in);
+ Iterator out(output, window_out);
+ Iterator k(weights, window_k);
+
+ execute_window_loop(window_k, [&](const Coordinates & id_k)
+ {
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const auto in_y = static_cast<int>(id.y() * conv_stride_x - conv_info.pad_left());
+ const auto in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top);
+
+ const uint8_t *in_ptr = in.ptr() + in_y * input_stride_y + in_z * input_stride_z;
+ uint8_t *out_ptr = out.ptr() + id_k[3] * output_stride_x;
+
+ T1 out_val = 0;
+
+ auto in_addr_base0 = in_ptr;
+ auto we_addr_base0 = k.ptr();
+
+ for(int z = 0; z < kernel_size; ++z, in_addr_base0 += input_stride_z, we_addr_base0 += kernel_stride_z)
+ {
+ const int in_z = id.z() * conv_stride_y + z - conv_pad_top;
+
+ if(in_z >= 0 && in_z < input_depth) // If false, pad top/bottom
+ {
+ auto in_addr_base1 = in_addr_base0;
+ auto we_addr_base1 = we_addr_base0;
+
+ for(int y = 0; y < kernel_size; ++y, in_addr_base1 += input_stride_y, we_addr_base1 += kernel_stride_y)
+ {
+ auto out_values = internal_vdupq_n(zero);
+
+ int x = 0;
+ int no_leftover = input_width - num_elems_read_per_iteration;
+
+ for(; x < no_leftover; x += num_elems_read_per_iteration)
+ {
+ const auto in_addr = reinterpret_cast<const T1 *>(in_addr_base1 + x * input_stride_x);
+ const auto in_values = internal_vld1q<1>(in_addr);
+
+ const auto we_addr = reinterpret_cast<const T1 *>(we_addr_base1 + x * kernel_stride_x);
+ const auto we_values = internal_vld1q<1>(we_addr);
+
+ out_values = internal_vmlal(out_values, in_values, we_values, 0);
+ }
+
+ out_val += out_values[0];
+ out_val += out_values[1];
+ out_val += out_values[2];
+ out_val += out_values[3];
+
+ // Leftover
+ for(; x < input_width; ++x)
+ {
+ const auto in_addr = reinterpret_cast<const T1 *>(in_addr_base1 + x * input_stride_x);
+ const auto in_value = *(in_addr);
+
+ const auto we_addr = reinterpret_cast<const T1 *>(we_addr_base1 + x * kernel_stride_x);
+ const auto we_value = *(we_addr);
+
+ out_val += in_value * we_value;
+ }
+ }
+ }
+ }
+
+ *(reinterpret_cast<T1 *>(out_ptr)) = out_val;
+ },
+ in, out);
+ },
+ k);
+ }
+};
+
template <typename T1, typename T2, unsigned int stridex>
class convolver_3x3
{
@@ -1003,35 +1116,28 @@
}
}
-inline TensorShape get_convolved_dimensions(const ITensorInfo *input, const ITensorInfo *weights, const int kernel_size, const PadStrideInfo &conv_info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_size, kernel_size, conv_info);
-
- TensorShape output_shape = input->tensor_shape();
- output_shape.set(0, output_width);
- output_shape.set(1, output_height);
- output_shape.set(2, weights->dimension(3));
-
- return output_shape;
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ const DataLayout data_layout = input->data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != input->dimension(channel_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32);
// Checks performed when output is configured
if(output->total_size() != 0)
{
- TensorShape output_shape = get_convolved_dimensions(input, weights, weights->dimension(0), conv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
DataType data_type = input->data_type();
if(is_data_type_fixed_point(data_type))
@@ -1050,101 +1156,127 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
{
+ ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+ const DataLayout data_layout = input->data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+
// Calculate right and bottom border
- unsigned int kernel_size = weights->dimension(0);
+ unsigned int kernel_size = weights->dimension(width_idx);
const int conv_stride_x = std::get<0>(conv_info.stride());
const int conv_stride_y = std::get<1>(conv_info.stride());
- const int input_width = input->dimension(0);
+ const int input_width = input->dimension(width_idx);
- switch(kernel_size)
+ Window win{};
+ bool window_changed = false;
+
+ if(data_layout == DataLayout::NCHW)
{
- case 1:
+ switch(kernel_size)
{
- switch(input->data_type())
+ case 1:
{
+ switch(input->data_type())
+ {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
+ case DataType::F16:
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- case DataType::QS16:
- num_elems_written_per_iteration = 8;
- break;
- case DataType::F32:
- if(run_optim_small_tensor_info(input))
- {
+ case DataType::QS8:
+ case DataType::QS16:
num_elems_written_per_iteration = 8;
- }
- else
- {
- num_elems_written_per_iteration = 4;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
+ break;
+ case DataType::F32:
+ if(run_optim_small_tensor_info(input))
+ {
+ num_elems_written_per_iteration = 8;
+ }
+ else
+ {
+ num_elems_written_per_iteration = 4;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
+ num_weight_elems_read_per_row = kernel_size;
+ num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
+ break;
}
- num_weight_elems_read_per_row = kernel_size;
- num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
- break;
- }
- case 3:
- case 5:
- {
- switch(input->data_type())
+ case 3:
+ case 5:
{
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
+ switch(input->data_type())
+ {
+ case DataType::F32:
+ num_weight_elems_read_per_row = 4 + kernel_size - 1;
+ num_elems_read_per_iteration = 12;
+ num_elems_written_per_iteration = 16 >> conv_stride_x;
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
+ case DataType::F16:
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- case DataType::QS16:
- num_weight_elems_read_per_row = 8 + kernel_size - 1;
- num_elems_read_per_iteration = 24;
- num_elems_written_per_iteration = 32 >> conv_stride_x;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
+ case DataType::QS8:
+ case DataType::QS16:
+ num_weight_elems_read_per_row = 8 + kernel_size - 1;
+ num_elems_read_per_iteration = 24;
+ num_elems_written_per_iteration = 32 >> conv_stride_x;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
+ }
+ break;
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ break;
}
}
- break;
- default:
- {
- ARM_COMPUTE_ERROR("Not implemented");
- break;
- }
+
+ // Calculate right pad
+ int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
+ int end_x = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
+ int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
+
+ // Calculate border
+ const unsigned int conv_pad_left = conv_info.pad_left();
+ const unsigned int conv_pad_top = conv_info.pad_top();
+ const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
+ const unsigned int conv_pad_bottom = conv_info.pad_bottom();
+
+ border_size.left = conv_pad_left;
+ border_size.top = conv_pad_top;
+ border_size.right = conv_pad_right;
+ border_size.bottom = conv_pad_bottom;
+
+ // Configure window
+ win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+ AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+ num_elems_read_per_iteration, kernel_size,
+ conv_stride_x, conv_stride_y);
+ AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
+ AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
}
+ else
+ {
+ border_size.left = 0;
+ border_size.top = conv_info.pad_left();
+ border_size.right = 0;
+ border_size.bottom = conv_info.pad_right();
- // Calculate right pad
- int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
- int end_x = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
- int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
+ num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
- // Calculate border
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
+ win = calculate_max_window(*output, Steps());
- border_size.left = conv_pad_left;
- border_size.top = conv_pad_top;
- border_size.right = conv_pad_right;
- border_size.bottom = conv_pad_bottom;
-
- // Configure window
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-
- AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
- num_elems_read_per_iteration, kernel_size,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
+ AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
+ window_changed = update_window_and_padding(win, input_access, weights_access);
+ }
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
@@ -1170,7 +1302,7 @@
_weights = weights;
_output = output;
_conv_info = conv_info;
- _kernel_size = weights->info()->dimension(0);
+ _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::WIDTH));
const unsigned int conv_pad_left = conv_info.pad_left();
const unsigned int conv_pad_top = conv_info.pad_top();
@@ -1179,7 +1311,7 @@
_border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
// Get convolved dimensions
- TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
DataType data_type = input->info()->data_type();
@@ -1229,73 +1361,88 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
- const int kernel_size = _weights->info()->dimension(0);
+ const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
- switch(kernel_size)
+ if(_input->info()->data_layout() == DataLayout::NCHW)
{
- case 1:
+ switch(kernel_size)
{
- switch(_input->info()->data_type())
+ case 1:
{
- case DataType::QS8:
- convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
- case DataType::QS16:
- convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
- case DataType::F32:
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
+ switch(_input->info()->data_type())
+ {
+ case DataType::QS8:
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::QS16:
+ convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
+ case DataType::F16:
+ convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ break;
}
- break;
- }
- case 3:
- {
- switch(_input->info()->data_type())
+ case 3:
{
- case DataType::QS8:
- convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
- case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
+ switch(_input->info()->data_type())
+ {
+ case DataType::QS8:
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
+ case DataType::F16:
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ break;
}
- break;
- }
- case 5:
- {
- switch(_input->info()->data_type())
+ case 5:
{
- case DataType::F32:
- convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ break;
}
- break;
- }
- default:
+ default:
+ {
+ ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
+ break;
+ }
+ }
+ }
+ else
+ {
+ switch(_input->info()->data_type())
{
- ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
- break;
+ case DataType::F32:
+ convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
}
}
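
The hunk above keeps the per-kernel-size specializations (1x1, 3x3, 5x5) for NCHW and routes NHWC through a single generic convolver that currently supports F32 only. A minimal, self-contained sketch of that dispatch shape follows; Layout, Type and select_convolver are illustrative names, not library symbols.

#include <functional>
#include <stdexcept>

enum class Layout { NCHW, NHWC };
enum class Type   { F32, F16 };

std::function<void()> select_convolver(Layout layout, Type type, int kernel_size)
{
    if(layout == Layout::NCHW)
    {
        switch(kernel_size) // NCHW keeps one specialization per kernel size
        {
            case 1: return [] { /* convolve_1x1<...>(...) */ };
            case 3: return [] { /* convolve_3x3<...>(...) */ };
            case 5: return [] { /* convolve_5x5<...>(...) */ };
            default: throw std::invalid_argument("Only kernel sizes 1x1, 3x3 and 5x5 are supported");
        }
    }
    if(type != Type::F32) // the NHWC path is F32-only here
    {
        throw std::invalid_argument("Data type not supported");
    }
    return [] { /* convolver_nhwc<float>::convolve(..., kernel_size, ...) */ };
}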
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 08d8f8c..edda2cd 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -44,6 +44,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
DataType::QS16, DataType::F16,
DataType::QS32, DataType::S32, DataType::F32);
@@ -68,6 +69,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
}
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)));
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
}
else
@@ -79,6 +81,8 @@
if((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
if(is_data_type_fixed_point(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
@@ -101,6 +105,8 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
{
+ ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
bool window_changed = false;
unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
@@ -138,8 +144,16 @@
}
else
{
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = update_window_and_padding(win, input_access, bias_access);
+ if(input->data_layout() == DataLayout::NCHW)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, bias_access);
+ }
+ else
+ {
+ AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, bias_access);
+ }
}
input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
@@ -253,6 +267,7 @@
void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
+ ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
ARM_COMPUTE_UNUSED(result_shift);
ARM_COMPUTE_UNUSED(result_offset_after_shift);
@@ -303,6 +318,66 @@
}
}
+template <typename T1, typename T2, bool in_place, bool has_bias>
+void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ if(in_place) // In place accumulate
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
+ }
+ else
+ {
+ internal_vst1q(in_ptr, internal_vld1q(in_ptr));
+ }
+ },
+ in, bi);
+ }
+ else // Out of place accumulate
+ {
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+ const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
+ }
+ else
+ {
+ internal_vst1q(out_ptr, internal_vld1q(in_ptr));
+ }
+ },
+ in, bi);
+ }
+}
+
// QASYMM8 specializations
template <>
void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
@@ -415,61 +490,79 @@
INEKernel::configure(win_config.second);
// Set appropriate function
- switch(input->info()->data_type())
+ if(input->info()->data_layout() == DataLayout::NCHW)
{
- case DataType::QS8:
+ switch(input->info()->data_type())
{
- if(bias == nullptr)
+ case DataType::QS8:
{
- _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+ if(bias == nullptr)
+ {
+ _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+ }
+ break;
}
- else
+ case DataType::QS16:
{
- _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+ if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+ }
+ else if(bias == nullptr)
+ {
+ _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
}
- break;
- }
- case DataType::QS16:
- {
- if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+ case DataType::QS32:
{
- _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+ _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
+ break;
}
- else if(bias == nullptr)
+ case DataType::S32:
{
- _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+ _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ break;
}
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
- break;
- }
- case DataType::QS32:
- {
- _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
- break;
- }
- case DataType::S32:
- {
- _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
- break;
- }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
- break;
- }
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+ break;
+ }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
- break;
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
}
- default:
+ }
+ else
+ {
+ switch(input->info()->data_type())
{
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
}
}
}
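
In the NHWC output stage added above, the channel dimension is innermost, so the bias is loaded as a plain vector and accumulated lane-wise; the kernel's internal_vld1q/internal_vqaddq wrappers pick saturating ops for integer types. A float-only sketch of the same idea with raw NEON intrinsics, illustrative names only:

#include <arm_neon.h>

void add_bias_nhwc_f32(float *inout, const float *bias, int channels)
{
    int c = 0;
    for(; c <= channels - 4; c += 4)
    {
        const float32x4_t v = vld1q_f32(inout + c);
        const float32x4_t b = vld1q_f32(bias + c);
        vst1q_f32(inout + c, vaddq_f32(v, b)); // in-place accumulate, one channel vector at a time
    }
    for(; c < channels; ++c)
    {
        inout[c] += bias[c]; // scalar tail for channel counts not divisible by 4
    }
}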
diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
deleted file mode 100644
index 768dd8b..0000000
--- a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height)
-{
- TensorShape output_shape = input->tensor_shape();
- const float interleave_by_f32 = block_height;
- output_shape.set(0, input->dimension(0) * interleave_by_f32);
- output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32));
- return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height)
-{
- const unsigned int num_elems_processed_per_iteration_x = block_width;
- const unsigned int num_elems_processed_per_iteration_y = block_height;
- bool window_changed = false;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- const float scaley_factor = 1.f / block_height;
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- window_changed = window_changed || update_window_and_padding(win, input_access);
-
- // Configure window in case of configured output
- if(output->total_size() != 0)
- {
- AccessWindowRectangle output_access(output,
- 0, 0,
- num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y,
- 1, num_elems_processed_per_iteration_y, scaley_factor);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
-{
- const size_t in_stride = input->info()->strides_in_bytes()[1];
-
- const unsigned int in_height = input->info()->dimension(1);
- const unsigned int in_width = input->info()->dimension(0);
-
- const float scale_y_factor = 1.f / float(block_height);
-
- // Set window for output tensor
- Window win_out(window);
- win_out.scale(Window::DimY, scale_y_factor);
- Iterator in(input, window);
-
- win_out.set_dimension_step(Window::DimX, block_width * block_height);
- Iterator out(output, win_out);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- std::fill_n(out.ptr(), block_width * block_height, 0);
- },
- out);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- for(unsigned int z = id.y(); (z < in_width) && z < (id.y() + block_height); ++z)
- {
- int j = (z - id.y()) * block_width;
- for(unsigned int b = id.x(); (b < in_height) && (b < (id.x() + block_width)); ++b)
- {
- *(out.ptr() + j++) = *(input->buffer() + b * in_stride + z);
- }
- }
- },
- in, out);
-}
-
-inline void gemm_interleave_blocked_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
-{
- const size_t in_stride = input->info()->strides_in_bytes()[1];
-
- const unsigned int in_height = input->info()->dimension(1);
- const unsigned int in_width = input->info()->dimension(0);
-
- const float scale_y_factor = 1.f / float(block_height);
-
- // Set window for output tensor
- Window win_out(window);
- win_out.scale(Window::DimY, scale_y_factor);
- Iterator in(input, window);
-
- win_out.set_dimension_step(Window::DimX, block_width * block_height);
- Iterator out(output, win_out);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- std::fill_n(out.ptr(), block_width * block_height, 0);
- },
- out);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- for(unsigned int z = id.y(); (z < in_height) && z < (id.y() + block_height); ++z)
- {
- int j = (z - id.y()) * block_width;
- for(unsigned int b = id.x(); (b < in_width) && (b < (id.x() + block_width)); ++b)
- {
- *(out.ptr() + j++) = *(input->buffer() + z * in_stride + b);
- }
- }
- },
- in, out);
-}
-} // namespace
-
-NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
- : _block_height(0), _block_width(0), _transpose(false)
-{
-}
-
-void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height));
-
- _input = input;
- _output = output;
- _block_height = block_height;
- _block_width = block_width;
- _transpose = transpose;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose)
-{
- ARM_COMPUTE_UNUSED(transpose);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first);
-
- return Status{};
-}
-
-void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_transpose)
- {
- gemm_interleave_blocked_transposed_8bit(_input, _output, window, _block_width, _block_height);
- }
- else
- {
- gemm_interleave_blocked_8bit(_input, _output, window, _block_width, _block_height);
- }
-}
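
For reference, the deleted kernel rearranged a U8 matrix into contiguous block_height x block_width tiles, zero-padding partial tiles at the edges. A scalar sketch of those semantics, reconstructed from the removed code (interleave_blocked is an illustrative name, not library API):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint8_t> interleave_blocked(const std::vector<uint8_t> &in, int height, int width,
                                        int block_h, int block_w)
{
    const int tiles_x = (width + block_w - 1) / block_w;
    const int tiles_y = (height + block_h - 1) / block_h;
    std::vector<uint8_t> out(static_cast<std::size_t>(tiles_x) * tiles_y * block_w * block_h, 0);

    uint8_t *dst = out.data();
    for(int ty = 0; ty < tiles_y; ++ty)
    {
        for(int tx = 0; tx < tiles_x; ++tx)
        {
            // Copy one tile row-major; positions past the matrix edge stay zero
            for(int y = ty * block_h; y < std::min(height, (ty + 1) * block_h); ++y)
            {
                for(int x = tx * block_w; x < std::min(width, (tx + 1) * block_w); ++x)
                {
                    dst[(y - ty * block_h) * block_w + (x - tx * block_w)] = in[static_cast<std::size_t>(y) * width + x];
                }
            }
            dst += block_w * block_h;
        }
    }
    return out;
}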
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 3fd81be..c204395 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -764,7 +764,7 @@
AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
INEKernel::configure(win);
}
@@ -786,7 +786,7 @@
Window win_in(window);
win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
- win_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
Iterator in(_input, win_in);
Iterator out(_output, window);
@@ -794,7 +794,7 @@
// Normalises blocks
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr() + id.y() * _num_cells_per_block_stride.height * _input->info()->strides_in_bytes()[Window::DimY]);
const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
// Execute normalization function
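
The fix above stops stepping the input window along Y (Window::Dimension(0, 0, 0)) and applies the per-block row offset by hand, since the required step is expressed in input rows per output row. A sketch of the equivalent pointer arithmetic, with illustrative names:

#include <cstddef>
#include <cstdint>

// Same arithmetic as: in.ptr() + id.y() * _num_cells_per_block_stride.height * strides_in_bytes[DimY]
const float *input_row(const uint8_t *in_base, int out_row_id, int cells_per_block_stride_h,
                       std::size_t in_row_stride_bytes)
{
    return reinterpret_cast<const float *>(
        in_base + static_cast<std::size_t>(out_row_id) * cells_per_block_stride_h * in_row_stride_bytes);
}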
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index 343b051..2c02ab8 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,8 +81,8 @@
// Configure kernel window
Window win;
- win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
- win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
constexpr unsigned int num_elems_read_per_iteration = 1;
const unsigned int num_rows_read_per_iteration = _num_blocks_per_descriptor_y;
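
The detector fix widens each Window::Dimension end by one step: the end bound is exclusive, so the previous floor_to_multiple(...) value dropped the last valid detection-window position. A hypothetical check of the arithmetic (exclusive_end is an illustrative helper, not library code):

#include <cassert>

int exclusive_end(int num_blocks, int blocks_per_window, int step)
{
    const int last_valid = ((num_blocks - blocks_per_window) / step) * step; // floor_to_multiple
    return last_valid + step;                                               // exclusive loop bound
}

int main()
{
    // With 10 blocks, 4 blocks per window, step 1: old bound 6 iterated x = 0..5
    // and skipped the last valid position x = 6; the new bound 7 covers it.
    assert(exclusive_end(10, 4, 1) == 7);
    return 0;
}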
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 4fa329b..86e3fd7 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -45,34 +45,34 @@
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected, bool is_flatten)
+ bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ TensorShape expected_output_shape;
if(is_flatten) /* Called by FlattenLayer */
{
- size_t flatten_shape = input->tensor_shape().x() * input->tensor_shape().y() * input->tensor_shape().z();
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != flatten_shape);
+ expected_output_shape = misc::shape_calculator::compute_im2col_flatten_shape(input);
}
else if(!is_fully_connected) /* Called by ConvolutionLayer */
{
- std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_dims.width, kernel_dims.height, conv_info);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(2) * kernel_dims.area() + (has_bias ? 1 : 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != (out_dims.first * out_dims.second));
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(2) != 1);
+ expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation);
}
else /* Called by FullyConnectedLayer */
{
const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
- TensorInfo expected_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_im2col_shape(input, num_input_dimensions));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+ expected_output_shape = misc::shape_calculator::compute_im2col_fc_shape(input, num_input_dimensions);
}
+ TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+
return Status{};
}
@@ -91,11 +91,13 @@
int input_stride_y,
int input_stride_z,
int fixed_point_position,
- int pad_value)
+ int pad_value,
+ int dilation_x,
+ int dilation_y)
{
const int kernel_size2 = kernel_width * kernel_height;
- const int x_e = top_left_x + kernel_width;
- const int y_e = top_left_y + kernel_height;
+ const int x_e = top_left_x + kernel_width * dilation_x;
+ const int y_e = top_left_y + kernel_height * dilation_y;
// Linearize volume
int d = 0;
@@ -104,12 +106,12 @@
// 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
for(; d <= (kernel_depth - 3); d += 3)
{
- for(int y = top_left_y; y < y_e; ++y)
+ for(int y = top_left_y; y < y_e; y += dilation_y)
{
if((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
- for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -118,7 +120,7 @@
}
else
{
- for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
if((x < 0 || x >= input_w) && has_pads)
{
@@ -141,7 +143,7 @@
// Left over
for(; d < kernel_depth; d++)
{
- for(int y = top_left_y; y < y_e; ++y)
+ for(int y = top_left_y; y < y_e; y += dilation_y)
{
if((y < 0 || y >= input_h) && has_pads)
{
@@ -151,7 +153,7 @@
}
else
{
- for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
if((x < 0 || x >= input_w) && has_pads)
{
@@ -191,12 +193,17 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const int kernel_depth = _input->info()->dimension(2);
- const int input_w = _input->info()->dimension(0);
- const int input_h = _input->info()->dimension(1);
- const int input_stride_x = _input->info()->strides_in_bytes().x();
- const int input_stride_y = _input->info()->strides_in_bytes().y();
- const int input_stride_z = _input->info()->strides_in_bytes().z();
+ const DataLayout data_layout = _input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const int kernel_depth = _input->info()->dimension(channel_idx);
+ const int input_w = _input->info()->dimension(width_idx);
+ const int input_h = _input->info()->dimension(height_idx);
+ const int input_stride_x = _input->info()->strides_in_bytes()[width_idx];
+ const int input_stride_y = _input->info()->strides_in_bytes()[height_idx];
+ const int input_stride_z = _input->info()->strides_in_bytes()[channel_idx];
const int offset = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
int pad_left = 0;
@@ -211,30 +218,24 @@
const int start_x = -pad_left;
const int start_y = -pad_top;
- Window window_in(window);
- // The first three dimensions of the input are increased by the inner loops
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Setup output window
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->strides_in_bytes().y() / _output->info()->element_size()));
- window_out.set(Window::DimY, Window::Dimension(window.y().start() * _convolved_dims.first, window.y().end() * _convolved_dims.first, _convolved_dims.first));
- window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ Window window_in_out(window);
+ // The first three dimensions of the input and output are increased by the inner loops
+ window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Create iterators
- Iterator in(_input, window_in);
- Iterator out(_output, window_out);
+ Iterator in(_input, window_in_out);
+ Iterator out(_output, window_in_out);
execute_window_loop(window, [&](const Coordinates & id)
{
- const int top_left_x = id.x() * stride_x + start_x;
- const int top_left_y = id.y() * stride_y + start_y;
+ const int top_left_x = id[width_idx] * stride_x + start_x;
+ const int top_left_y = id[height_idx] * stride_y + start_y;
// Get pointers
const uint8_t *const input_ptr = in.ptr();
- auto output_ptr = reinterpret_cast<T *>(out.ptr());
+ auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());
// Linearize volume
linearize_volume<T, has_pads>(input_ptr,
@@ -251,7 +252,9 @@
input_stride_y,
input_stride_z,
_input->info()->fixed_point_position(),
- offset);
+ offset,
+ _dilation.x(),
+ _dilation.y());
},
in, out);
}
@@ -309,28 +312,33 @@
}
NEIm2ColKernel::NEIm2ColKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false)
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
{
}
void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected, bool is_flatten)
+ bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
- ARM_COMPUTE_UNUSED(is_fully_connected);
- ARM_COMPUTE_UNUSED(is_flatten);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
+ ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
+
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
_input = input;
_output = output;
_conv_info = conv_info;
_kernel_width = kernel_dims.width;
- _kernel_height = kernel_dims.height,
- _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ _kernel_height = kernel_dims.height;
+ _dilation = dilation;
+ _convolved_dims = scaled_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
_kernel_width, _kernel_height,
- _conv_info);
+ _conv_info, _dilation);
_has_bias = has_bias;
unsigned int stride_x = 0;
@@ -341,7 +349,8 @@
&& (std::equal(input->info()->tensor_shape().cbegin() + 3,
input->info()->tensor_shape().cend(),
output->info()->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+ && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
+ && ((dilation.x() == 1) && (dilation.y() == 1));
Window window = calculate_max_window(*input->info(), Steps());
@@ -396,9 +405,9 @@
ARM_COMPUTE_ERROR("Data type not supported");
break;
}
- window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
- window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
- window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ window.set(width_idx, Window::Dimension(0, _convolved_dims.first, 1));
+ window.set(height_idx, Window::Dimension(0, _convolved_dims.second, 1));
+ window.set(channel_idx, Window::Dimension(0, 1, 1));
}
// The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -408,9 +417,9 @@
}
Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, bool is_fully_connected, bool is_flatten)
+ bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
return Status{};
}
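
Dilation threads through the im2col path above by spacing kernel taps dilation_x/dilation_y pixels apart while keeping kernel_w x kernel_h taps per patch; taps falling outside the image read the pad value. A scalar sketch of one linearized patch (im2col_patch is an illustrative name):

#include <cstddef>
#include <vector>

std::vector<float> im2col_patch(const std::vector<float> &img, int img_w, int img_h,
                                int top_left_x, int top_left_y, int kernel_w, int kernel_h,
                                int dilation_x, int dilation_y, float pad_value)
{
    std::vector<float> patch;
    patch.reserve(static_cast<std::size_t>(kernel_w) * kernel_h);
    const int x_e = top_left_x + kernel_w * dilation_x; // matches the new x_e/y_e above
    const int y_e = top_left_y + kernel_h * dilation_y;
    for(int y = top_left_y; y < y_e; y += dilation_y)
    {
        for(int x = top_left_x; x < x_e; x += dilation_x)
        {
            const bool inside = (x >= 0) && (x < img_w) && (y >= 0) && (y < img_h);
            patch.push_back(inside ? img[static_cast<std::size_t>(y) * img_w + x] : pad_value);
        }
    }
    return patch;
}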
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 3bf1d940..91776d8 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,6 +67,55 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_UNUSED(epsilon);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis; supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
+
+ // Reduce shape on axis
+ TensorShape sum_shape = input->tensor_shape();
+ sum_shape.set(axis, 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output, unsigned int axis)
+{
+ const unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+ const unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_tuple(err, win);
+}
} // namespace
NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
@@ -77,18 +126,7 @@
void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
- ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
- ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
- unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
_input = input;
_sum = sum;
@@ -97,16 +135,18 @@
_epsilon = epsilon;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ auto win_config = validate_and_configure_window(_input->info(), _sum->info(), _output->info(), axis);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- update_window_and_padding(win, input_access, sum_access, output_access);
+ INEKernel::configure(std::get<1>(win_config));
+}
- output_access.set_valid_region(win, input->info()->valid_region());
+Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), sum->clone().get(), output->clone().get(), axis)));
- INEKernel::configure(win);
+ return Status{};
}
void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
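
This kernel (and several below) is refactored into the library's validate/configure split: all checks live in free functions so a static validate() can run them on cloned tensor infos without configuring a live kernel. A self-contained sketch of the pattern using standard types only (all names illustrative):

#include <string>
#include <tuple>

struct Status
{
    bool        ok = true;
    std::string msg;
};

// Stage 1: argument checks (shape/type rules)
Status validate_arguments_sketch(int in_dim, int out_dim)
{
    if(in_dim != out_dim)
    {
        return { false, "shape mismatch" };
    }
    return {};
}

// Stage 2: window configuration, which can also fail (e.g. "Insufficient Padding!")
std::tuple<Status, int /* window stand-in */> validate_and_configure_window_sketch(int in_dim)
{
    return std::make_tuple(Status{}, in_dim);
}

// Static validate(): run both stages on copies, never touching a live kernel
Status validate_sketch(int in_dim, int out_dim)
{
    Status s = validate_arguments_sketch(in_dim, out_dim);
    if(!s.ok)
    {
        return s;
    }
    return std::get<0>(validate_and_configure_window_sketch(in_dim));
}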
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 004ecd0..83593e7 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -356,13 +356,16 @@
_termination = termination;
_use_initial_estimate = use_initial_estimate;
_epsilon = epsilon;
- _num_iterations = num_iterations;
_window_dimension = window_dimension;
_level = level;
_num_levels = num_levels;
_pyramid_scale = pyramid_scale;
_num_levels = num_levels;
+ // Set maximum number of iterations used for convergence
+ const size_t max_iterations = 1000;
+ _num_iterations = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
+
Window window;
window.set(Window::DimX, Window::Dimension(0, old_points->num_values()));
window.set(Window::DimY, Window::Dimension(0, 1));
@@ -471,7 +474,7 @@
float prev_delta_x = 0.0f;
float prev_delta_y = 0.0f;
- for(unsigned int j = 0; j < _num_iterations || _termination == Termination::TERM_CRITERIA_EPSILON; ++j)
+ for(unsigned int j = 0; j < _num_iterations; ++j)
{
if(is_invalid_keypoint(new_keypoint))
{
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 58da040..099626d 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -186,7 +186,7 @@
win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
Iterator ina(input0, win_a);
Iterator out(output, win_out);
@@ -234,7 +234,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif /* __arm __ */
+#endif /* __arm__ */
acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -302,6 +302,37 @@
},
ina, out);
}
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration_x = 16;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal input0_access(input0, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+ bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_tuple(err, win);
+}
} // namespace
NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
@@ -311,31 +342,27 @@
void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
_input0 = input0;
_input1 = input1;
_output = output;
- const unsigned int num_elems_processed_per_iteration_x = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- update_window_and_padding(win,
- AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
- output_access);
+ INEKernel::configure(std::get<1>(win_config));
+}
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+Status NELocallyConnectedMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get())));
- INEKernel::configure(win);
+ return Status{};
}
void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index 01be36b..434f4eb 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -32,14 +32,60 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <algorithm>
#include <arm_neon.h>
#include <climits>
#include <cstddef>
+using namespace arm_compute::misc::shape_calculator;
+
namespace arm_compute
{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ TensorShape output_shape = compute_min_max_shape(input);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ TensorShape output_shape = compute_min_max_shape(input);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, 2);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
NEMinMaxLayerKernel::NEMinMaxLayerKernel()
: _input(nullptr), _output(nullptr), _mtx()
{
@@ -47,36 +93,25 @@
void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(Window::DimX, 2);
- output_shape.remove_dimension(1);
- output_shape.remove_dimension(1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
+ auto win_config = validate_and_configure_window(input->info(), output->info());
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, 2);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- update_window_and_padding(win, input_access, output_access);
+ INEKernel::configure(std::get<1>(win_config));
+}
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+Status NEMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
- INEKernel::configure(win);
+ return Status{};
}
void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
@@ -160,7 +195,7 @@
float32x2_t reset_values = vdup_n_f32(0.0f);
reset_values = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
- reset_values = vset_lane_f32(std::numeric_limits<float>::min(), reset_values, 1);
+ reset_values = vset_lane_f32(std::numeric_limits<float>::lowest(), reset_values, 1);
Window window_output;
window_output.use_tensor_dimensions(_output->info()->tensor_shape());
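
The min()-to-lowest() change above fixes the identity element used to reset the running maximum: for floating-point types std::numeric_limits<T>::min() is the smallest positive normal value, not the most negative one, so an all-negative input would have produced a wrong max. A quick standalone check:

#include <cassert>
#include <limits>

int main()
{
    static_assert(std::numeric_limits<float>::min() > 0.0f,
                  "min() is the smallest positive normal, not the most negative value");
    assert(std::numeric_limits<float>::lowest() < -3.4e38f); // suitable identity for a running max
    return 0;
}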
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index ad66acd..b90e813 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/utility.h"
+#include "arm_compute/core/utils/misc/Utility.h"
#include <algorithm>
#include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index c271032..193ca37 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,18 +54,23 @@
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
{
// Check that all data types are the same and all fixed-point positions are the same
@@ -96,19 +101,44 @@
inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output, input1->tensor_shape());
+
+ if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output, Format::S16);
+ }
+ else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output, Format::F32);
+ }
+ else if(input1->data_type() == DataType::F16 || input2->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output, Format::F16);
+ }
+ else if(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8)
+ {
+ set_data_type_if_unknown(*output, DataType::QS8);
+ set_fixed_point_position_if_zero(*output, input1->fixed_point_position());
+ }
+ }
// Configure kernel window
- Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win,
- AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
- output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
- input2->valid_region());
+ bool window_changed = update_window_and_padding(win_input1, input1_access)
+ || update_window_and_padding(win_input2, input2_access)
+ || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, valid_region);
@@ -508,31 +538,12 @@
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if(input1->info()->data_type() == DataType::QS8 && input2->info()->data_type() == DataType::QS8)
- {
- set_data_type_if_unknown(*output->info(), DataType::QS8);
- set_fixed_point_position_if_zero(*output->info(), input1->info()->fixed_point_position());
- }
- }
-
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
_input1 = input1;
_input2 = input2;
_output = output;
@@ -656,15 +667,13 @@
ARM_COMPUTE_ERROR("You called with the wrong img formats");
}
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
RoundingPolicy rounding_policy)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
@@ -677,34 +686,71 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Iterator input1(_input1, window);
- Iterator input2(_input2, window);
- Iterator output(_output, window);
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed = can_collapse ? window.collapse_if_possible(INEKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ Iterator input1(_input1, slice_input1);
+ Iterator input2(_input2, slice_input2);
+ Iterator output(_output, slice);
if(_func_int != nullptr)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(collapsed, [&](const Coordinates & id)
{
(*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
},
input1, input2, output);
}
else if(_func_q_int != nullptr)
{
int fixed_point_position = _input1->info()->fixed_point_position();
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(collapsed, [&](const Coordinates & id)
{
(*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
},
input1, input2, output);
}
else
{
ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(collapsed, [&](const Coordinates & id)
{
(*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
},
input1, input2, output);
}
}
+
+BorderSize NEPixelWiseMultiplicationKernel::border_size() const
+{
+ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
\ No newline at end of file
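The run() rewrite above implements broadcasting by giving each input its own 3D slice and sliding all slices in lock-step through the collapsed window. The same idea in stand-alone form, with plain arrays instead of Window/Iterator (a sketch, not the library's loop):

// Sketch: lock-step outer loop where one operand may be broadcast (outer == 1).
#include <cstddef>
#include <vector>

void broadcast_mul(const std::vector<float> &a, std::size_t a_outer,
                   const std::vector<float> &b, std::size_t b_outer, // 1 => broadcast
                   std::vector<float> &out, std::size_t inner)
{
    const std::size_t outer = (a_outer > b_outer) ? a_outer : b_outer;
    for(std::size_t o = 0; o < outer; ++o)
    {
        const float *pa = a.data() + (a_outer == 1 ? 0 : o * inner); // stick or slide
        const float *pb = b.data() + (b_outer == 1 ? 0 : o * inner);
        float       *po = out.data() + o * inner;
        for(std::size_t i = 0; i < inner; ++i)
        {
            po[i] = pa[i] * pb[i];
        }
    }
}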
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index b6af517..7877cf5 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -53,20 +53,24 @@
void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
{
TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, pooled_w);
- output_shape.set(1, pooled_h);
+ output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH), pooled_w);
+ output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT), pooled_h);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
}
-template <bool exclude_padding>
+template <bool exclude_padding, DataLayout data_layout>
inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
- int start_x = id.x() * stride_x - pad_x;
- int start_y = id.y() * stride_y - pad_y;
- const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
- const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+ const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ int start_x = id[idx_width] * stride_x - pad_x;
+ int start_y = id[idx_height] * stride_y - pad_y;
+
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
if(exclude_padding)
{
start_x = std::max(0, start_x);
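calculate_avg_scale() returns the reciprocal of the clamped pooling-region area. A worked sketch of that arithmetic for one output element (all numbers illustrative):

// Sketch: NCHW, 3x3 pool, stride 1, pad 1, 5x5 input, output coordinate (0, 0),
// exclude_padding == true.
#include <algorithm>

float example_avg_scale()
{
    int start_x = 0 * 1 - 1;                              // coord * stride - pad = -1
    int start_y = 0 * 1 - 1;                              // -1
    const int end_x = std::min(start_x + 3, 5);           // 2
    const int end_y = std::min(start_y + 3, 5);           // 2
    start_x = std::max(0, start_x);                       // clamped into the tensor: 0
    start_y = std::max(0, start_y);                       // 0
    return 1.f / ((end_x - start_x) * (end_y - start_y)); // 1 / 4
}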
@@ -175,7 +179,9 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
+ || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
}
return Status{};
@@ -193,12 +199,16 @@
BorderSize &border_size,
unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
{
+ // Get data layout
+ DataLayout data_layout = input->data_layout();
unsigned int num_elems_read_per_iteration = 0;
unsigned int num_elems_horizontal_window = 0;
int pool_stride_x = 0;
int pool_stride_y = 0;
- const int input_width = input->dimension(0);
- const int input_height = input->dimension(1);
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int input_width = input->dimension(idx_width);
+ const int input_height = input->dimension(idx_height);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
const int pool_pad_right = pad_stride_info.pad_right();
@@ -206,18 +216,22 @@
const int pool_pad_left = pad_stride_info.pad_left();
const int pool_pad_bottom = pad_stride_info.pad_bottom();
const bool is_square = pool_size_x == pool_size_y;
+
// Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
- input->dimension(1),
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
+ input->dimension(idx_height),
pool_size_x,
pool_size_y,
pad_stride_info);
+ auto_init(input, output, pooled_w, pooled_h);
// If the pool is not square, or no optimized path applies, the generic MxN kernel is executed
num_elems_read_per_iteration = 1;
num_elems_processed_per_iteration = 1;
num_elems_horizontal_window = 1;
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+
if(is_square)
{
switch(input->data_type())
@@ -239,6 +253,11 @@
}
break;
case DataType::QASYMM8:
+ if(is_nhwc)
+ {
+ num_elems_processed_per_iteration = 8;
+ break;
+ }
switch(pool_size_x)
{
case 2:
@@ -273,6 +292,11 @@
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
+ if(is_nhwc)
+ {
+ num_elems_processed_per_iteration = 8;
+ break;
+ }
switch(pool_size_x)
{
case 2:
@@ -291,6 +315,11 @@
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
+ if(is_nhwc)
+ {
+ num_elems_processed_per_iteration = 4;
+ break;
+ }
switch(pool_size_x)
{
case 2:
@@ -313,35 +342,61 @@
break;
}
}
- // Number of iterations in X dimension
- const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
-
- // Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
-
- border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
- border_size.right = std::max(upper_bound_w, pool_pad_right);
- border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
- bool window_changed = false;
-
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, pooled_w);
- output_shape.set(1, pooled_h);
- TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
-
- Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
-
- if(output->total_size() != 0)
+ else
{
+ if(is_nhwc)
+ {
+ if(DataType::QASYMM8 == input->data_type())
+ {
+ num_elems_processed_per_iteration = 8;
+ }
+ else
+ {
+ num_elems_processed_per_iteration = 4;
+ }
+ }
+ }
+
+ bool window_changed = false;
+ Window win{};
+ if(data_layout == DataLayout::NCHW)
+ {
+ // Number of iterations in X dimension
+ const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
+
+ // Upper limit for the number of right/bottom border elements that are accessed
+ const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
+
+ border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+ border_size.right = std::max(upper_bound_w, pool_pad_right);
+ border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
+
+ TensorShape output_shape{ input->tensor_shape() };
+ output_shape.set(0, pooled_w);
+ output_shape.set(1, pooled_h);
+ TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
+
+ win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
+
AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
}
else
{
- window_changed = update_window_and_padding(win, input_access);
+ TensorShape output_shape{ input->tensor_shape() };
+ output_shape.set(1, pooled_w);
+ output_shape.set(2, pooled_h);
+ TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
+
+ win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
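In the NCHW branch the right/bottom borders must cover whatever a vectorized iteration can read past the input edge. A worked sketch of the sizing arithmetic, with illustrative numbers:

// Sketch: input_width 10, pooled_w 5, 4-element steps, stride_x 2, 8-element reads.
int example_right_border()
{
    const int pooled_w = 5, step = 4, stride_x = 2, pad_left = 0, reads = 8, input_width = 10;
    const int num_iterations_x = (pooled_w + step - 1) / step;          // 2
    const int upper_bound_w = ((num_iterations_x - 1) * step * stride_x
                               - pad_left + reads) - input_width;       // 6
    return upper_bound_w; // border_size.right = max(upper_bound_w, pad_right)
}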
@@ -368,18 +423,25 @@
const bool exclude_padding = pool_info.exclude_padding();
const bool is_global_pooling = pool_info.is_global_pooling();
const int pool_stride_x = pad_stride_info.stride().first;
+ unsigned int pool_size_x = 0;
+ unsigned int pool_size_y = 0;
+
+ // Get data layout
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Update pool size in case of global pooling
- const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width;
- const int pool_size_y = is_global_pooling ? input->info()->dimension(1) : pool_info.pool_size().height;
+ pool_size_x = is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
+ pool_size_y = is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
// Validate pool info before calling scaled_dimensions
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
// Check output dimensions
unsigned int pooled_w, pooled_h;
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
- input->info()->dimension(1),
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
+ input->info()->dimension(idx_height),
pool_size_x,
pool_size_y,
pad_stride_info);
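scaled_dimensions() applies the standard pooled-size formula under the rounding mode carried by pad_stride_info. A hedged reimplementation of the FLOOR case only (the library also supports CEIL rounding):

// Sketch, not the library's scaled_dimensions(): pooled size with FLOOR rounding.
#include <utility>

std::pair<unsigned int, unsigned int> pooled_size_floor(unsigned int w, unsigned int h,
                                                        unsigned int pool_x, unsigned int pool_y,
                                                        unsigned int stride_x, unsigned int stride_y,
                                                        unsigned int pad_l, unsigned int pad_r,
                                                        unsigned int pad_t, unsigned int pad_b)
{
    const unsigned int pooled_w = (w + pad_l + pad_r - pool_x) / stride_x + 1;
    const unsigned int pooled_h = (h + pad_t + pad_b - pool_y) / stride_y + 1;
    return std::make_pair(pooled_w, pooled_h); // 7x7 input, 3x3 pool, stride 2 -> 3x3
}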
@@ -398,6 +460,7 @@
// Get data type
const DataType data_type = input->info()->data_type();
+ const bool is_nchw = data_layout == DataLayout::NCHW;
// Select appropriate function
if(data_type == DataType::QS8)
@@ -410,10 +473,10 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+ _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::AVG>;
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -423,10 +486,10 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+ _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::AVG>;
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -436,7 +499,7 @@
switch(pool_type)
{
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -449,7 +512,7 @@
switch(pool_type)
{
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -463,10 +526,24 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::MAX>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -477,10 +554,24 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::MAX>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -491,10 +582,24 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::MAX>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::MAX>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -511,10 +616,10 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+ _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::AVG>;
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -524,10 +629,10 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+ _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::AVG>;
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -537,7 +642,7 @@
switch(pool_type)
{
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -550,7 +655,7 @@
switch(pool_type)
{
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -567,13 +672,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -583,13 +709,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -599,13 +746,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -618,13 +786,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -641,13 +830,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -657,13 +867,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -673,13 +904,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -689,13 +941,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -708,13 +981,34 @@
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+ }
break;
case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+ if(is_nchw)
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
+ }
+ else
+ {
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+ }
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+ if(is_nchw)
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -729,7 +1023,7 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -794,7 +1088,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -908,7 +1202,7 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q16_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -973,7 +1267,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
@@ -1012,7 +1306,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float16x4_t scale_v = vdup_n_f16(scale);
// Perform pooling
const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
@@ -1043,7 +1337,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
@@ -1078,7 +1372,7 @@
if(pooling_type != PoolingType::MAX)
{
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float16x8_t scale_v = vdupq_n_f16(scale);
res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
}
@@ -1105,7 +1399,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1141,7 +1435,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
@@ -1168,7 +1462,7 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1244,7 +1538,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1364,7 +1658,7 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q16_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1435,7 +1729,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1474,7 +1768,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
@@ -1503,7 +1797,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1532,7 +1826,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
@@ -1586,7 +1880,7 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1640,7 +1934,7 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_q16_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1690,7 +1984,7 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
@@ -1716,7 +2010,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
@@ -1813,7 +2107,116 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window)
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
+ const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
+ const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
+ const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
+ const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+ const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+
+ float16x8_t vres;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+
+ // Perform pooling
+ vres = vdupq_n_f16(0.0f);
+
+ for(int y = 0; y < pool_size_y; ++y)
+ {
+ if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ {
+ continue;
+ }
+
+ for(int x = 0; x < pool_size_x; ++x)
+ {
+ if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+ {
+ continue;
+ }
+
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if(pooling_type == PoolingType::L2)
+ {
+ vres = vaddq_f16(vres, vmulq_f16(data, data));
+ }
+ else
+ {
+ vres = vaddq_f16(vres, data);
+ }
+ }
+ }
+ // Divide by scale
+ vres = vmulq_f16(vres, scale_v);
+ }
+ else
+ {
+ vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
+ for(int y = 0; y < pool_size_y; ++y)
+ {
+ if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ {
+ continue;
+ }
+
+ for(int x = 0; x < pool_size_x; ++x)
+ {
+ if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+ {
+ continue;
+ }
+
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+ vres = vmaxq_f16(vres, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
+ vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+ }
+
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
+ },
+ input, output);
+
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+
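In the NHWC kernels, channels are contiguous along x, so one vector load covers several channels of a single spatial point, and spatial moves use the tensor's y (width) and z (height) byte strides, as in the vld1q address computation above. A sketch of that addressing:

#include <cstddef>
#include <cstdint>

// Sketch: byte address of the (x, y) tap relative to the iterator position.
const std::uint8_t *tap_ptr(const std::uint8_t *base,
                            int x, int pad_left, std::ptrdiff_t stride_y_bytes,
                            int y, int pad_top, std::ptrdiff_t stride_z_bytes)
{
    return base + (x - pad_left) * stride_y_bytes   // step along width
                + (y - pad_top) * stride_z_bytes;   // step along height
}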
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1837,7 +2240,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
float32x4_t vres = vdupq_n_f32(0.0f);
@@ -1936,7 +2339,109 @@
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
+ const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
+ const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
+ const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
+ const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+ const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+
+ float32x4_t vres;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+
+ // Perform pooling
+ vres = vdupq_n_f32(0.0f);
+
+ for(int y = 0; y < pool_size_y; ++y)
+ {
+ if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ {
+ continue;
+ }
+
+ for(int x = 0; x < pool_size_x; ++x)
+ {
+ if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+ {
+ continue;
+ }
+
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if(pooling_type == PoolingType::L2)
+ {
+ vres = vmlaq_f32(vres, data, data);
+ }
+ else
+ {
+ vres = vaddq_f32(vres, data);
+ }
+ }
+ }
+ // Divide by scale
+ vres = vmulq_f32(vres, scale_v);
+ }
+ else
+ {
+ vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
+ for(int y = 0; y < pool_size_y; ++y)
+ {
+ if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ {
+ continue;
+ }
+
+ for(int x = 0; x < pool_size_x; ++x)
+ {
+ if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+ {
+ continue;
+ }
+
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+ vres = vmaxq_f32(vres, data);
+ }
+ }
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(vres);
+ vres = vmulq_f32(vres, vmulq_f32(vrsqrtsq_f32(vmulq_f32(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+ }
+
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
+ },
+ input, output);
+}
+
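The L2 epilogue computes sqrt(v) as v * rsqrt(v), refining the rough vrsqrte estimate with vrsqrts Newton-Raphson steps (vrsqrts(a, b) yields (3 - a*b) / 2). The same scheme in scalar form, as a sketch:

// Sketch: scalar analogue of the vrsqrte/vrsqrts chain above. The NEON estimate
// is already ~8-bit accurate so one step suffices; this crude seed needs a few more.
float sqrt_via_rsqrt(float v)
{
    float r = 1.0f / (1.0f + 0.5f * (v - 1.0f)); // rough seed, reasonable near v == 1
    for(int i = 0; i < 3; ++i)
    {
        r = r * ((3.0f - v * r * r) * 0.5f);     // Newton-Raphson step for rsqrt
    }
    return v * r;                                // sqrt(v) = v * rsqrt(v)
}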
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1963,7 +2468,7 @@
uint32_t sres = 0;
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
for(int y = 0; y < pool_size_y; ++y)
@@ -2031,6 +2536,101 @@
input, output);
}
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
+ const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
+ const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
+ const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
+ const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+ const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ if(pooling_type != PoolingType::MAX)
+ {
+ uint32x4_t vres1 = vdupq_n_u32(0);
+ uint32x4_t vres2 = vdupq_n_u32(0);
+
+ // Calculate scale
+ const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+
+ // Perform pooling
+ for(int y = 0; y < pool_size_y; ++y)
+ {
+ if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ {
+ continue;
+ }
+
+ for(int x = 0; x < pool_size_x; ++x)
+ {
+ if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+ {
+ continue;
+ }
+
+ const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+
+ const uint16x8_t data_u16 = vmovl_u8(data);
+ vres1 = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
+ vres2 = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
+ }
+ }
+ // Divide by scale
+ vres1 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres1), scale_v));
+ vres2 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres2), scale_v));
+
+ uint8x8_t res = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
+
+ // Store result
+ vst1_u8(output.ptr(), res);
+ }
+ else
+ {
+ uint8x8_t vres = vdup_n_u8(0);
+
+ for(int y = 0; y < pool_size_y; ++y)
+ {
+ if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ {
+ continue;
+ }
+
+ for(int x = 0; x < pool_size_x; ++x)
+ {
+ if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+ {
+ continue;
+ }
+
+ const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+ vres = vmax_u8(vres, data);
+ }
+ }
+
+ // Store result
+ vst1_u8(output.ptr(), vres);
+ }
+ },
+ input, output);
+}
+
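The QASYMM8 average path widens u8 lanes through u16 into u32 accumulators so sums cannot wrap, multiplies by the scale in float, then narrows back down. The same widen-accumulate-narrow scheme in scalar form, as a sketch:

#include <cstdint>

std::uint8_t average_u8(const std::uint8_t *vals, int count, float scale)
{
    std::uint32_t acc = 0;                 // wide accumulator, like vres1/vres2
    for(int i = 0; i < count; ++i)
    {
        acc += vals[i];                    // u8 widened on add, no wraparound
    }
    const std::uint32_t scaled = static_cast<std::uint32_t>(acc * scale);
    return static_cast<std::uint8_t>(scaled); // truncating narrow, like vmovn
}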
Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
@@ -2040,16 +2640,24 @@
unsigned int num_elems_processed_per_iteration = 0;
BorderSize border_size(0);
- const bool is_global_pooling = pool_info.is_global_pooling();
- const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
- const unsigned int pool_size_y = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
+ const bool is_global_pooling = pool_info.is_global_pooling();
+ unsigned int pool_size_x = 0;
+ unsigned int pool_size_y = 0;
+
+ // Get data layout
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size().width;
+ pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size().height;
// Validate pool info before calling scaled_dimensions
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
// Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
- input->dimension(1),
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
+ input->dimension(idx_height),
pool_size_x,
pool_size_y,
pool_info.pad_stride_info());
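As with the other kernels in this release, the pooling validate() can serve as a pre-flight check. A hedged usage sketch (pool shape and padding are illustrative):

#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

bool pool_is_supported(const ITensorInfo *input, const ITensorInfo *output)
{
    const PoolingLayerInfo info(PoolingType::AVG, Size2D(3, 3),
                                PadStrideInfo(1, 1, 1, 1));
    return bool(NEPoolingLayerKernel::validate(input, output, info));
}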
@@ -2073,39 +2681,48 @@
const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
const unsigned int pool_size = _pool_info.pool_size().width;
- // Set step for input in x and y direction for the input
- Window window_input(window);
- unsigned int window_x_inc = 0;
- switch(_input->info()->data_type())
+ Window window_input(window);
+ if(_input->info()->data_layout() == DataLayout::NCHW)
{
- case DataType::QS8:
- case DataType::QS16:
- case DataType::F16:
+ // Set step for input in x and y direction for the input
+ unsigned int window_x_inc = 0;
+ switch(_input->info()->data_type())
{
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
- break;
- }
- case DataType::QASYMM8:
- {
- window_x_inc = pool_stride_x;
- if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+ case DataType::QS8:
+ case DataType::QS16:
+ case DataType::F16:
{
window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ break;
}
- break;
+ case DataType::QASYMM8:
+ {
+ window_x_inc = pool_stride_x;
+ if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ window_x_inc = pool_stride_x;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
- case DataType::F32:
- {
- window_x_inc = pool_stride_x;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
+ window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
+ window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
}
- window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
- window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+ else
+ {
+ window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
+ window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
+ window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+ }
// Run function
(this->*_func)(window_input, window);
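In the NHWC branch just above, x walks channels one vector at a time while y and z sweep the spatial extent by the pool strides, and each kernel invocation reduces its own pool window internally. A compact sketch of the implied loop structure:

int nhwc_invocations(int channels, int step_c, int in_w, int in_h,
                     int stride_x, int stride_y)
{
    int calls = 0;
    for(int z = 0; z < in_h; z += stride_y)           // DimZ: height by stride_y
    {
        for(int y = 0; y < in_w; y += stride_x)       // DimY: width by stride_x
        {
            for(int c = 0; c < channels; c += step_c) // DimX: channel vectors
            {
                ++calls;                              // one pool-window reduction
            }
        }
    }
    return calls;
}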
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 767af08..ee23e76 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+ // Update window and padding
+ bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
NEQuantizationLayerKernel::NEQuantizationLayerKernel()
: _input(nullptr), _output(nullptr), _min_max(nullptr)
{
@@ -41,33 +81,27 @@
void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
_input = input;
_output = output;
_min_max = min_max;
- constexpr unsigned int num_elems_processed_per_iteration = 8;
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- // Update window and padding
- update_window_and_padding(win, input_access, output_access, min_max_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+ INEKernel::configure(std::get<1>(win_config));
+}
- INEKernel::configure(win);
+Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+ return Status{};
}
void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
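The kernel now exposes a static validate() that mirrors configure(), so a caller can vet a configuration from TensorInfo objects alone, before allocating any tensors. An illustrative usage sketch (shapes and the helper name are hypothetical):

#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

bool quantization_config_is_valid()
{
    const TensorInfo input(TensorShape(16U, 16U, 4U), 1, DataType::F32); // at least 3 dimensions required
    const TensorInfo output(TensorShape(16U, 16U, 4U), 1, DataType::U8);
    const TensorInfo min_max(TensorShape(2U, 1U), 1, DataType::F32);     // holds the (min, max) pair

    const Status status = NEQuantizationLayerKernel::validate(&input, &output, &min_max);
    return status.error_code() == ErrorCode::OK; // OK means configure() would accept the same arguments
}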
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 1a50ed8..30d42fa 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include <arm_neon.h>
@@ -94,6 +95,61 @@
ARM_COMPUTE_ERROR("Unsupported reduction axis");
}
}
+
+TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
+{
+ TensorShape output_shape{ input_shape };
+ output_shape.set(axis, 1);
+
+ return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis; only axis 0 is supported");
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+
+ const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+{
+ // Calculate output shape and set if empty
+ const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_tuple(err, win);
+}
} // namespace
NEReductionOperationKernel::NEReductionOperationKernel()
@@ -109,19 +165,8 @@
void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
- ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
- // Calculate output shape and set if empty
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(axis, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
@@ -131,14 +176,19 @@
_op = op;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- INEKernel::configure(win);
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+
+ return Status{};
}
void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &info)
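The file-local calculate_output_shape helper simply collapses the reduced axis to 1; with only axis 0 supported, a (W, H, C) input reduces to (1, H, C). A sketch mirroring that helper:

#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

TensorShape reduced_shape(const TensorShape &input_shape, unsigned int axis)
{
    TensorShape output_shape{ input_shape };
    output_shape.set(axis, 1); // e.g. (8, 4, 2) with axis 0 -> (1, 4, 2)
    return output_shape;
}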
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 1918a77..7111644 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,125 +28,337 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Utility.h"
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
+ const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
+ BorderMode border_mode, SamplingPolicy sampling_policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output == input);
+ ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_UNUSED(border_mode);
+
+ const DataLayout data_layout = input->data_layout();
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)) == 0);
+
+ if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ }
+
+ if(policy == InterpolationPolicy::BILINEAR)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
+ }
+
+ if(policy == InterpolationPolicy::AREA)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
+ InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+{
+ bool window_changed{ false };
+ Window win{};
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ const ValidRegion &input_valid_region = input->valid_region();
+
+ if(offsets != nullptr)
+ {
+ AccessWindowHorizontal offsets_access(offsets, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, offsets_access);
+ }
+ if(dx != nullptr && dy != nullptr)
+ {
+ AccessWindowHorizontal dx_access(dx, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal dy_access(dy, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, dx_access, dy_access);
+ }
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input, input_valid_region.anchor[0] - border_size.left,
+ input_valid_region.anchor[1] - border_size.top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size.right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size.bottom);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, calculate_valid_region_scale(*input, output->tensor_shape(),
+ policy, sampling_policy, border_undefined));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output,
+ InterpolationPolicy policy, bool border_undefined,
+ SamplingPolicy sampling_policy, BorderSize border_size)
+{
+ bool window_changed{ false };
+ Window win{};
+
+ const unsigned int num_elems_processed_per_iteration = (policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
+
+ // Configure kernel window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, -border_size.top,
+ ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration),
+ input->tensor_shape()[1]);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(),
+ policy, sampling_policy, border_undefined));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
+ InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+{
+ std::pair<Status, Window> win_config;
+ switch(input->data_layout())
+ {
+ case DataLayout::NCHW:
+ win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, policy, border_undefined, sampling_policy, border_size);
+ break;
+ case DataLayout::NHWC:
+ win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size);
+ break;
+ default:
+ win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
+ }
+
+ return win_config;
+}
+
+template <typename T>
+inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
+ float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c)
+{
+ Iterator in(input, win_in);
+ Iterator out(output, window);
+
+ const size_t offsets_stride = stride_w / sizeof(T);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_yi = (id.z() + 0.5f) * hr;
+ const int offset_row = in_yi * stride_h + id.x() * stride_c;
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()),
+ wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row)));
+ },
+ in, out);
+}
+
+template <typename T>
+inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
+ float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+{
+ Iterator in(input, win_in);
+ Iterator out(output, window);
+
+ const size_t stride_w_elems = stride_w / sizeof(T);
+ const size_t stride_h_elems = stride_h / sizeof(T);
+
+ const int input_width = input->info()->dimension(1);
+ const int input_height = input->info()->dimension(2);
+
+ const T *border_area = reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
+
+ auto is_valid = [](int x, int low_x, int high_x, int y, int low_y, int high_y)
+ {
+ return !(x < low_x || x > high_x || y < low_y || y > high_y);
+ };
+
+ int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offset = (*reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast<int>(sizeof(T));
+ const auto dx_scale = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_scale = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_yi = std::floor((id.z() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * stride_h + id.x() * stride_c;
+ const T *in_ptr = reinterpret_cast<T *>(in.ptr() + offset * stride_w + offset_row);
+
+ if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size))
+ {
+ T a00 = 0, a01 = 0, a10 = 0, a11 = 0;
+
+ if(border_mode == BorderMode::CONSTANT)
+ {
+ a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : *border_area;
+ a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : *border_area;
+ a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : *border_area;
+ a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : *border_area;
+ }
+ else if(border_mode == BorderMode::REPLICATE)
+ {
+ auto clamped_x = utility::clamp<int>(offset, 0, input_width - 1);
+ auto clamped_x1 = utility::clamp<int>(offset + 1, 0, input_width - 1);
+ auto clamped_y = utility::clamp<int>(in_yi, 0, input_height - 1);
+ auto clamped_y1 = utility::clamp<int>(in_yi + 1, 0, input_height - 1);
+
+ a00 = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c);
+ a01 = *reinterpret_cast<T *>(in.ptr() + clamped_x1 * stride_w + clamped_y * stride_h + id.x() * stride_c);
+ a10 = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y1 * stride_h + id.x() * stride_c);
+ a11 = *reinterpret_cast<T *>(in.ptr() + clamped_x1 * stride_w + clamped_y1 * stride_h + id.x() * stride_c);
+ }
+ else
+ {
+ a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : 0;
+ a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : 0;
+ a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : 0;
+ a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : 0;
+ }
+
+ // Perform interpolation
+ const float dx1 = 1.0f - dx_scale;
+ const float dy1 = 1.0f - dy_scale;
+
+ const float w1 = dx1 * dy1;
+ const float w2 = dx_scale * dy1;
+ const float w3 = dx1 * dy_scale;
+ const float w4 = dx_scale * dy_scale;
+
+ // Store result
+ *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+ }
+ else
+ {
+ if(border_mode == BorderMode::CONSTANT)
+ {
+ *reinterpret_cast<T *>(out.ptr()) = *border_area;
+ }
+ else if(border_mode == BorderMode::REPLICATE)
+ {
+ auto clamped_x = utility::clamp<int>(offset, 0, input_width - 1);
+ auto clamped_y = utility::clamp<int>(in_yi, 0, input_height - 1);
+ *reinterpret_cast<T *>(out.ptr()) = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c);
+ }
+ }
+ },
+ in, out);
+}
+} // namespace
NEScaleKernel::NEScaleKernel()
- : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr)
+ : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode()
{
}
BorderSize NEScaleKernel::border_size() const
{
- return BorderSize(1);
+ return _border_size;
}
-void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined,
- SamplingPolicy sampling_policy)
+void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
+ ITensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output == input);
- ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
- ARM_COMPUTE_UNUSED(sampling_policy);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+ dx != nullptr ? dx->info() : nullptr,
+ dy != nullptr ? dy->info() : nullptr,
+ offsets != nullptr ? offsets->info() : nullptr,
+ output->info(),
+ policy, border_mode, sampling_policy));
+
+ // Get data layout and width/height indices
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _input = input;
+ _output = output;
+ _offsets = offsets;
+ _dx = dx;
+ _dy = dy;
+ _policy = policy;
+ _border_size = BorderSize(1);
+ _border_mode = border_mode;
+
+ // Compute the ratio between source width/height and destination width/height
+ const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
+ const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
+
+    // In NHWC, a border is only needed on top, and only for bilinear scaling with a constant border mode
+ if(data_layout == DataLayout::NHWC)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ _border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
}
- if(policy == InterpolationPolicy::BILINEAR)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
- }
-
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);
-
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
-
- _input = input;
- _output = output;
- _offsets = offsets;
- _dx = dx;
- _dy = dy;
-
- /* Compute the ratio between source width/height and destination width/height */
- const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
- const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
-
- /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
{
policy = InterpolationPolicy::NEAREST_NEIGHBOR;
}
+ // Select interpolation function
switch(policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
- _func = &NEScaleKernel::scale_nearest;
+ _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_nearest_nchw : &NEScaleKernel::scale_nhwc;
break;
}
case InterpolationPolicy::BILINEAR:
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dx, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dy, 1, DataType::F32);
-
- _func = &NEScaleKernel::scale_bilinear;
+ _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_bilinear_nchw : &NEScaleKernel::scale_nhwc;
break;
}
case InterpolationPolicy::AREA:
{
- _func = &NEScaleKernel::scale_area;
+ _func = &NEScaleKernel::scale_area_nchw;
break;
}
default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- const ValidRegion &input_valid_region = input->info()->valid_region();
-
- // Reads can occur within the valid region of the input
- AccessWindowStatic input_access(input->info(),
- input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
- input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
- input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
- AccessWindowHorizontal offsets_access(offsets == nullptr ? nullptr : offsets->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- input_access,
- offsets_access,
- dx_access,
- dy_access,
- output_access);
-
- output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(), border_undefined));
- INEKernel::configure(win);
+ // Configure window
+ std::pair<Status, Window> win_config = validate_and_configure_window(input->info(),
+ dx != nullptr ? dx->info() : nullptr,
+ dy != nullptr ? dy->info() : nullptr,
+ offsets != nullptr ? offsets->info() : nullptr,
+ output->info(),
+ policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
}
-void NEScaleKernel::scale_nearest(const Window &window)
+void NEScaleKernel::scale_nearest_nchw(const Window &window)
{
const size_t input_stride = _input->info()->strides_in_bytes()[1];
@@ -159,15 +371,16 @@
win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ // Set offsets window
Window win_off;
win_off.set(Window::DimX, window[Window::DimX]);
win_off.set(Window::DimY, window[Window::DimY]);
-
for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
+ // Create iterators
Iterator in(_input, win_in);
Iterator out(_output, window);
Iterator offsets(_offsets, win_off);
@@ -300,7 +513,7 @@
}
}
-void NEScaleKernel::scale_bilinear(const Window &window)
+void NEScaleKernel::scale_bilinear_nchw(const Window &window)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
@@ -465,15 +678,16 @@
}
}
-void NEScaleKernel::scale_area(const Window &window)
+void NEScaleKernel::scale_area_nchw(const Window &window)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
- // Don't increment in X and Y direction for the input tensor
+ // Don't increment in width/height/channels for the input tensor
// A pointer to the start of this plane is needed as base for the precomputed offsets
Window win_in(window);
win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator in(_input, win_in);
Iterator out(_output, window);
@@ -513,6 +727,97 @@
in, out);
}
+void NEScaleKernel::scale_nhwc(const Window &window)
+{
+ // Get data layout and width/height indices
+ const DataLayout data_layout = _input->info()->data_layout();
+ const int idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const size_t input_stride_w = _input->info()->strides_in_bytes()[idx_width];
+ const size_t input_stride_h = _input->info()->strides_in_bytes()[idx_height];
+ const size_t input_stride_c = _input->info()->strides_in_bytes()[idx_channels];
+
+ // Compute the ratio between source height and destination height
+ const auto hr = static_cast<float>(_input->info()->dimension(idx_height)) / static_cast<float>(_output->info()->dimension(idx_height));
+
+ // Don't increment in width/height/channels for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+ }
+ else
+ {
+ scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ }
+ break;
+ }
+ case DataType::S16:
+ {
+ if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+ }
+ else
+ {
+ scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+ }
+ else
+ {
+ scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
+ const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
+ BorderMode border_mode, SamplingPolicy sampling_policy)
+{
+ BorderSize border_size(1);
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, sampling_policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ dx != nullptr ? dx->clone().get() : nullptr,
+ dy != nullptr ? dy->clone().get() : nullptr,
+ offsets != nullptr ? offsets->clone().get() : nullptr,
+ output->clone().get(),
+ policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size)
+ .first);
+
+ return Status{};
+}
+
void NEScaleKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -522,3 +827,4 @@
(this->*_func)(window);
}
+} // namespace arm_compute
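For reference, both the NCHW path and scale_bilinear_nhwc_core above blend the four neighbours with weights that sum to one. A standalone sketch of that interpolation step:

// Bilinear blend of the four neighbours of a sample point.
// dx and dy in [0, 1] are the fractional offsets held in the _dx/_dy tensors.
inline float bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w1 = (1.0f - dx) * (1.0f - dy); // top-left
    const float w2 = dx * (1.0f - dy);          // top-right
    const float w3 = (1.0f - dx) * dy;          // bottom-left
    const float w4 = dx * dy;                   // bottom-right
    return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
}

// Example: dx = 0.25f, dy = 0.5f gives weights 0.375, 0.125, 0.375, 0.125 (sum = 1).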
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 13d87a0..d91efd2 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -33,7 +33,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/utility.h"
+#include "arm_compute/core/utils/misc/Utility.h"
#include <algorithm>
#include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index c863ed4..e6f3acc 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 1501402..3031a87 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,12 +34,16 @@
namespace
{
-template <typename T>
+template <typename T, bool is_nhwc>
void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
{
- const unsigned int kernel_size_x = input->info()->dimension(0);
- const unsigned int kernel_size_y = input->info()->dimension(1);
- const unsigned int kernel_depth = input->info()->dimension(2);
+ DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int kernel_size_x = input->info()->dimension(idx_width);
+ const unsigned int kernel_size_y = input->info()->dimension(idx_height);
+ const unsigned int kernel_depth = input->info()->dimension(idx_channel);
const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
@@ -67,13 +71,13 @@
for(unsigned int i = 0; i < kernel_size_x; ++i)
{
*(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
- tmp_input_ptr += input_stride_x;
+ tmp_input_ptr += is_nhwc ? input_stride_y : input_stride_x;
tmp_output_ptr += output_stride_y;
}
- curr_input_row_ptr += input_stride_y;
+ curr_input_row_ptr += is_nhwc ? input_stride_z : input_stride_y;
tmp_input_ptr = curr_input_row_ptr;
}
- curr_input_depth_ptr += input_stride_z;
+ curr_input_depth_ptr += is_nhwc ? input_stride_x : input_stride_z;
curr_input_row_ptr = curr_input_depth_ptr;
tmp_input_ptr = curr_input_depth_ptr;
}
@@ -161,21 +165,24 @@
_bias = bias;
_output = output;
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+
switch(_input->info()->element_size())
{
case 4:
{
- _func = &weights_reshape<uint32_t>;
+ _func = is_nhwc ? &weights_reshape<uint32_t, true> : &weights_reshape<uint32_t, false>;
break;
}
case 2:
{
- _func = &weights_reshape<uint16_t>;
+ _func = is_nhwc ? &weights_reshape<uint16_t, true> : &weights_reshape<uint16_t, false>;
break;
}
case 1:
{
- _func = &weights_reshape<uint8_t>;
+ _func = is_nhwc ? &weights_reshape<uint8_t, true> : &weights_reshape<uint8_t, false>;
break;
}
default:
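Aside on the stride swaps above: in NCHW the byte strides (x, y, z) belong to (width, height, channel), while in NHWC they belong to (channel, width, height). The is_nhwc branches therefore advance by stride_y per kernel column, stride_z per kernel row and stride_x per channel. Equivalent offset arithmetic, as a sketch:

#include <cstddef>

// Byte offset of kernel element (x, y, c) given the tensor's (x, y, z) byte strides.
inline std::size_t kernel_elem_offset(std::size_t x, std::size_t y, std::size_t c,
                                      std::size_t stride_x, std::size_t stride_y, std::size_t stride_z, bool nhwc)
{
    return nhwc ? x * stride_y + y * stride_z + c * stride_x  // NHWC: strides map to (C, W, H)
                : x * stride_x + y * stride_y + c * stride_z; // NCHW: strides map to (W, H, C)
}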
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..672684d
--- /dev/null
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+// Batched GEMMs
+
+namespace
+{
+Status validate_arguments_winograd_gemm(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta,
+ const GEMMInfo &gemm_info = GEMMInfo())
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(b);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B");
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != a->num_dimensions());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_UNUSED(alpha, beta);
+ return Status{};
+}
+
+Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != 3 && input->dimension(idx_width) != 5);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ const Size2D &output_tile = winograd_info.output_tile_size;
+ ARM_COMPUTE_RETURN_ERROR_ON(output_tile != Size2D(2U, 2U) && output_tile != Size2D(4U, 4U));
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_winograd_weight_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ const Size2D kernel_dims = winograd_info.kernel_size;
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info)));
+
+ unsigned int num_elems_processed_per_iteration_x = kernel_dims.width;
+ unsigned int num_elems_processed_per_iteration_y = kernel_dims.height;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ bool window_changed = false;
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+ return std::make_pair(err, win_collapsed);
+}
+
+Status validate_arguments_winograd_input_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ const Size2D &kernel_dims = winograd_info.kernel_size;
+ const PadStrideInfo &conv_info = winograd_info.convolution_info;
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd input transform only supports 3x3 and 5x5 kernels");
+
+ // Validate configured output
+ if(output->total_size() != 0)
+ {
+ const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_winograd_input_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_dims = winograd_info.kernel_size;
+ const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+ unsigned int num_elems_read_per_iteration_x = (output_tile_size.width + kernel_dims.width - 1);
+ unsigned int num_elems_read_per_iteration_y = (output_tile_size.height + kernel_dims.height - 1);
+
+ Window win = calculate_max_window(*input, Steps(1, 1));
+
+ AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+
+ bool window_changed = update_window_and_padding(win, input_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+Status validate_arguments_winograd_output_trans(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ const PadStrideInfo &conv_info = winograd_info.convolution_info;
+ const Size2D kernel_dims = winograd_info.kernel_size;
+
+    // Number of tiles along the X and Y directions
+ const unsigned int num_tiles_x = std::ceil((winograd_info.input_dimensions.x() - (kernel_dims.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>
+ (winograd_info.output_tile_size.width));
+ const unsigned int num_tiles_y = std::ceil((winograd_info.input_dimensions.y() - (kernel_dims.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>
+ (winograd_info.output_tile_size.height));
+ const Size2D num_tiles = Size2D(num_tiles_x, num_tiles_y);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(winograd_info.output_data_layout != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd output transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd output transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((input->dimension(2) != size_t(16U)) && (input->dimension(2) != size_t(36U))), "Only 2x2 and 4x4 output tiles are supported");
+ ARM_COMPUTE_UNUSED(kernel_dims);
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != size_t(1));
+ }
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ return Status{};
+}
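+
+// Worked example of the tile-count arithmetic above (illustrative numbers only):
+// a 56x56 input, a 3x3 kernel, 1 pixel of padding on each side and a 4x4 output
+// tile give num_tiles_x = ceil((56 - 2 + 1 + 1) / 4.0f) = 14 (and likewise in y),
+// so input->dimension(1) must equal 14 * 14 = 196 tiles.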
+
+std::pair<Status, Window> validate_and_configure_window_winograd_output_trans(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info)));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), 2), ceil_to_multiple(output->dimension(1), 2));
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, bias_access, output_access);
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ }
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerBatchedGEMMKernel()
+ : _gemms()
+{
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+ const unsigned int n_gemms,
+ const int M, const int K, const int N,
+ const int a_matrix_stride,
+ const int a_row_stride,
+ const int b_matrix_stride,
+ const int b_row_stride,
+ const int c_matrix_stride,
+ const int c_row_stride,
+ const TIn *const a_ptr,
+ const TIn *const b_ptr,
+ TOut *const c_ptr)
+{
+ _gemms = support::cpp14::make_unique<MultiGEMM>(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr);
+ Window win;
+ auto win_last = _gemms->get_window();
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+ INEKernel::configure(win);
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ const size_t first_gemm = window.x().start();
+ const size_t last_gemm = window.x().end();
+ _gemms->run(first_gemm, last_gemm);
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_gemms() const
+{
+ return WinogradBase::N_GEMMS;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_rows() const
+{
+ return _output_tile_rows;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_cols() const
+{
+ return _output_tile_cols;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_blocks() const
+{
+ return WinogradConv::N_BLOCK;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c,
+ const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_gemm(a, b, c, output, alpha, beta, gemm_info));
+ return Status{};
+}
+
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>;
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>;
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>;
+
+// Weights transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int n_output_channels, int n_input_channels) const
+{
+ const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels);
+ return static_cast<unsigned int>(
+        // WinogradConv returns the size in bytes; we divide by sizeof(T) to express it in units of T
+ WinogradConv::get_kernel_storage_size(shape) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
+ : _transform()
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(const KernelShape &kernel_shape) const
+{
+ return WinogradConv::get_kernel_matrix_stride(kernel_shape);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+ const ITensor *weights_hwio,
+ T *const output,
+ const int matrix_stride, /** Stride across matrices in the output. */
+ const int n_output_channels, /** Number of filters. */
+ const int n_input_channels) /** Number of channels in each filter. */
+{
+ const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK);
+ _transform = support::cpp14::make_unique<WeightsTransform>(reinterpret_cast<T *>(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels,
+ n_input_channels);
+ Window win;
+ auto win_last = _transform->get_window();
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+ INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ const size_t fst = window.x().start();
+ const size_t lst = window.x().end();
+ _transform->run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
+{
+ return false;
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_weight_trans(input, output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_weight_trans(input->clone().get(), output->clone().get(), winograd_info).first);
+ return Status{};
+}
+
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
+
+// Input transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
+ int n_batches, /** Number of batches in the input tensor. */
+ int n_channels, /** Number of feature maps in the input tensor. */
+ int n_rows, /** Number of rows in each feature map. */
+ int n_cols, /** Number of columns in each feature map. */
+ bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+) const
+{
+ // Construct shapes for the input and kernel tensors.
+ const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
+ const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels);
+ const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+ // Return the size, converted into units of TIn
+ return static_cast<unsigned int>(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
+ const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
+{
+ return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
+ : _transform()
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+ const T *const input, /** Input tensor data */
+ const int n_batches, /** Number of batches in input tensor. */
+ const int n_rows, /** Number of rows in input tensor. */
+ const int n_cols, /** Number of columns in input tensor. */
+ const int n_channels, /** Number of channels in input tensor. */
+ const PaddingType padding, /** Padding type. */
+ T *const output, /** Base of output matrices. */
+ const int matrix_stride) /** Stride between output matrices. */
+{
+ _transform = support::cpp14::make_unique<InputTransform>(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels);
+ Window win;
+ auto win_last = _transform->get_window();
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+ INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ const size_t fst = window.x().start();
+ const size_t lst = window.x().end();
+ _transform->run(fst, lst);
+}
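Unlike the weights transform, the input transform carries no is_parallelisable() override in this file, so the scheduler may hand each thread a slice of DimX. A standalone sketch of one plausible way to slice a 1-D iteration space (illustrative arithmetic, not the scheduler's actual policy):

    #include <algorithm>

    // Slice `total` work items into contiguous per-thread ranges; slice
    // `thread_id` is the half-open range [start, end) that run() would see.
    void slice_1d(int total, int num_threads, int thread_id, int &start, int &end)
    {
        const int per_thread = (total + num_threads - 1) / num_threads; // ceiling division
        start = std::min(thread_id * per_thread, total);
        end   = std::min(start + per_thread, total);
    }
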
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_input_trans(input, output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_input_trans(input->clone().get(), output->clone().get(), winograd_info).first);
+
+ return Status{};
+}
+
+template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
+
+// Output transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
+ int n_batches, /** Number of batches in the output tensor. */
+ int n_rows, /** Number of rows in each feature map of the input tensor. */
+ int n_cols, /** Number of columns in each feature map of the input tensor. */
+ int n_output_channels, /** Number of feature maps in the output tensor. */
+ bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+) const
+{
+ // Construct shapes for the input and kernel tensors.
+ const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
+ const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1);
+ const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+
+ // Return the size, converted into units of TOut
+ return static_cast<unsigned int>(
+ WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
+ : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0)
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
+ const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
+{
+ return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Tensor4DShape NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
+ const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const
+{
+ return WinogradConv::get_output_shape(kernel_shape, in_shape, padding);
+}
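get_output_shape defers to WinogradConv, but for the stride-1 convolutions Winograd handles, the familiar SAME/VALID rule gives each spatial extent. A hedged standalone sketch (the library call remains authoritative):

    // Output extent of a stride-1 convolution along one dimension.
    constexpr int conv_out_dim(int in_dim, int kernel_dim, bool same_padding)
    {
        return same_padding ? in_dim : in_dim - kernel_dim + 1;
    }
    static_assert(conv_out_dim(32, 3, true) == 32, "SAME preserves the extent");
    static_assert(conv_out_dim(32, 3, false) == 30, "VALID shrinks by kernel_dim - 1");
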
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+ const ITensor *biases,
+ const T *const output_workingspace,
+ const int matrix_stride,
+ T *const output,
+ const int n_batches,
+ const int n_rows,
+ const int n_cols,
+ const int n_channels)
+{
+ _biases = biases;
+ _output_workspace = output_workingspace;
+ _matrix_stride = matrix_stride;
+ _matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK);
+ _output = output;
+ _n_batches = n_batches;
+ _n_rows = n_rows;
+ _n_cols = n_cols;
+ _n_channels = n_channels;
+
+    // The biases buffer has not been allocated at this stage, so we pass nullptr; this OutputTransform instance is used only to compute the window
+ OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels);
+ Window win;
+ auto win_last = output_transform.get_window();
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+ INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
+
+ OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
+ (_biases ? reinterpret_cast<T *>(_biases->buffer()) : nullptr), _output,
+ _n_batches, _n_rows, _n_cols, _n_channels);
+
+    // None of this can move to configure(): the biases buffer is only allocated after configuration
+ const size_t fst = window.x().start();
+ const size_t lst = window.x().end();
+ output_transform.run(fst, lst);
+}
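configure() and run() above split the work of binding pointers: the window is sized at configure time with a nullptr bias (now optional, where the deleted file asserted on it), and the live bias pointer is bound on every run() once the buffer exists. A minimal standalone sketch of this deferred-binding pattern, with illustrative names only:

    #include <cstddef>

    // Cheap-to-construct transform: built once without the bias just to size
    // the window, then rebuilt with live pointers on every run.
    struct Transform
    {
        const float *bias;
        std::size_t  n_items;
        Transform(const float *b, std::size_t n) : bias(b), n_items(n) {}
        std::size_t get_window() const { return n_items; } // independent of bias
        void run(std::size_t first, std::size_t last) { (void)first; (void)last; }
    };

    std::size_t configure_phase(std::size_t n)
    {
        return Transform(nullptr, n).get_window(); // bias not needed yet
    }

    void run_phase(const float *bias, std::size_t n, std::size_t first, std::size_t last)
    {
        Transform(bias, n).run(first, last);       // bias bound late
    }
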
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_output_trans(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_output_trans(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(),
+ winograd_info)
+ .first);
+
+ return Status{};
+}
+
+template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
+
+} // namespace arm_compute
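For reference, the three transform kernels plus the batched GEMMs in this file realise the stages of the Winograd identity for a single tile,

    Y = A^T * [ (G g G^T) .* (B^T d B) ] * A

where g is the filter tile (weights transform, matrix G), d the input tile (input transform, matrix B), .* the element-wise product computed by the batched GEMMs across channels, and the multiplications by A the output transform.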
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
deleted file mode 100644
index b2e44f8..0000000
--- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-//Batched Gemms
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerBatchedGEMMKernel()
- : _gemms()
-{
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const unsigned int n_gemms,
- const int M, const int K, const int N,
- const int a_matrix_stride,
- const int a_row_stride,
- const int b_matrix_stride,
- const int b_row_stride,
- const int c_matrix_stride,
- const int c_row_stride,
- const TIn *const a_ptr,
- const TIn *const b_ptr,
- TOut *const c_ptr)
-{
- _gemms = support::cpp14::make_unique<MultiGEMM>(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr);
- Window win;
- auto win_last = _gemms->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
- INEKernel::configure(win);
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- const size_t first_gemm = window.x().start();
- const size_t last_gemm = window.x().end();
- _gemms->run(first_gemm, last_gemm);
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_gemms() const
-{
- return WinogradBase::N_GEMMS;
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_rows() const
-{
- return _output_tile_rows;
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_cols() const
-{
- return _output_tile_cols;
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_blocks() const
-{
- return WinogradConv::N_BLOCK;
-}
-
-template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>;
-template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>;
-
-// Weights transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int n_output_channels, int n_input_channels) const
-{
- const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels);
- return static_cast<unsigned int>(
- // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
- WinogradConv::get_kernel_storage_size(shape) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
- : _transform()
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(const KernelShape &kernel_shape) const
-{
- return WinogradConv::get_kernel_matrix_stride(kernel_shape);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const ITensor *weights_hwio,
- T *const output,
- const int matrix_stride, /** Stride across matrices in the output. */
- const int n_output_channels, /** Number of filters. */
- const int n_input_channels) /** Number of channels in each filter. */
-{
- const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK);
- _transform = support::cpp14::make_unique<WeightsTransform>(reinterpret_cast<T *>(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels,
- n_input_channels);
- Window win;
- auto win_last = _transform->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
- INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- _transform->run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
- return false;
-}
-
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
-
-// Input transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
- int n_batches, /** Number of batches in the input tensor. */
- int n_channels, /** Number of feature maps in the input tensor. */
- int n_rows, /** Number of rows in each feature map. */
- int n_cols, /** Number of columns in each feature map. */
- bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
-) const
-{
- // Construct shapes for the input and kernel tensors.
- const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
- const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels);
- const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
- // Return the size, converted into units of TIn
- return static_cast<unsigned int>(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
- const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
-{
- return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
- : _transform()
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const T *const input, /** Input tensor data */
- const int n_batches, /** Number of batches in input tensor. */
- const int n_rows, /** Number of rows in input tensor. */
- const int n_cols, /** Number of columns in input tensor. */
- const int n_channels, /** Number of channels in input tensor. */
- const PaddingType padding, /** Padding type. */
- T *const output, /** Base of output matrices. */
- const int matrix_stride) /** Stride between output matrices. */
-{
- // _input_matrix_row_stride(n_input_channels),
- _transform = support::cpp14::make_unique<InputTransform>(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels);
- Window win;
- auto win_last = _transform->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
- INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- _transform->run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
- return false;
-}
-
-template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
-
-// Output transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
- int n_batches, /** Number of batches in the output tensor. */
- int n_rows, /** Number of rows in each feature map of the input tensor. */
- int n_cols, /** Number of columns in each feature map of the input tensor. */
- int n_output_channels, /** Number of feature maps in the output tensor. */
- bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
-) const
-{
- // Construct shapes for the input and kernel tensors.
- const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
- const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1);
- const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
-
- // Return the size, converted into units of TOut
- return static_cast<unsigned int>(
- WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
- : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0)
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
- const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
-{
- return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type);
-}
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-Tensor4DShape NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
- const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const
-{
- return WinogradConv::get_output_shape(kernel_shape, in_shape, padding);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const ITensor *biases,
- const T *const output_workingspace,
- const int matrix_stride,
- T *const output,
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels)
-{
- _biases = biases;
- _output_workspace = output_workingspace;
- _matrix_stride = matrix_stride;
- _matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK);
- _output = output;
- _n_batches = n_batches;
- _n_rows = n_rows;
- _n_cols = n_cols;
- _n_channels = n_channels;
-
- // We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window
- OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels);
- Window win;
- auto win_last = output_transform.get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
- INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer());
- ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
-
- OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
- reinterpret_cast<T *>(_biases->buffer()), _output,
- _n_batches, _n_rows, _n_cols, _n_channels);
-
- // The code below cannot be moved to configure because biases hasn't been allocated at that point
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- output_transform.run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
- return false;
-}
-
-template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
-
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
deleted file mode 100644
index bffcbbf..0000000
--- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
- const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
- reinterpret_cast<const float *>(in1_ptr), ldb,
- reinterpret_cast<float *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
-}
-} // namespace arm_compute
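This kernel and the AArch64 variants below carve a per-thread slot out of the shared workspace: each thread gets working_size + alignment - 1 bytes, so an aligned working_size region can always be found inside its slot. A standalone sketch of that arithmetic, assuming the caller sized the workspace accordingly:

    #include <cstddef>
    #include <cstdint>
    #include <memory>

    // Returns an `alignment`-aligned pointer to `working_size` bytes inside
    // the slot of `thread_id`, or nullptr if the slot cannot fit it.
    void *thread_workspace(std::uint8_t *base, std::size_t total_size,
                           std::size_t working_size, std::size_t alignment,
                           std::size_t thread_id)
    {
        const std::size_t offset = (working_size + alignment - 1) * thread_id;
        if(offset > total_size)
        {
            return nullptr;
        }
        void       *ptr   = base + offset;
        std::size_t space = total_size - offset;
        return std::align(alignment, working_size, ptr, space);
    }
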
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
deleted file mode 100644
index 0eaa9aa..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
- const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
- reinterpret_cast<const float *>(in1_ptr), ldb,
- reinterpret_cast<float *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
deleted file mode 100644
index 0b3212b..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch64NativeKernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(16U, 4U));
-
- const int input0_access_end_x = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
- const int input0_access_end_y = ceil_to_multiple(input0->info()->tensor_shape().y(), 4);
- const int input1_access_end_x = ceil_to_multiple(input1->info()->tensor_shape().x(), 16);
-
- AccessWindowStatic input0_access(input0->info(), 0, 0, input0_access_end_x, input0_access_end_y);
- AccessWindowStatic input1_access(input1->info(), 0, 0, input1_access_end_x, input1->info()->tensor_shape().y());
- AccessWindowRectangle output_access(output->info(), 0, 0, 16, 4);
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMAArch64NativeKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_UNUSED(info);
-
- const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
- // Calculate row strides for each matrix
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
- // Calculate matrix sizes
- const int M = std::min(_input0->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int K = _input0->info()->tensor_shape().x();
- const int N = _input1->info()->tensor_shape().x();
-
- // Create window (Only iterate over batches)
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- // Create Iterators
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- // Execute GEMM
- execute_window_loop(win, [&](const Coordinates & id)
- {
- BlockedGemm<4, 16, float, float>(reinterpret_cast<const float *>(in0.ptr()),
- reinterpret_cast<const float *>(in1_ptr),
- reinterpret_cast<float *>(out.ptr()),
- M, K, N,
- lda, ldb, ldc);
- },
- in0, out);
-}
-} // namespace arm_compute
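BlockedGemm<4, 16, float, float> produces the same result as a plain triple loop, tiled into 4x16 output blocks for register reuse. A hedged reference version of the semantics (row-major, with row strides lda/ldb/ldc in elements):

    // C[M x N] = A[M x K] * B[K x N], no blocking and no NEON.
    void naive_gemm(const float *a, const float *b, float *c,
                    int M, int K, int N, int lda, int ldb, int ldc)
    {
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                float acc = 0.f;
                for(int k = 0; k < K; ++k)
                {
                    acc += a[i * lda + k] * b[k * ldb + j];
                }
                c[i * ldc + j] = acc;
            }
        }
    }
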
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
deleted file mode 100644
index 80606dc..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel()
- : _func(nullptr)
-{
-}
-
-void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
- const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
- const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
- reinterpret_cast<const int8_t *>(in1_ptr), ldb,
- reinterpret_cast<int32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
- const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
- const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
- reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
- reinterpret_cast<uint32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- switch(input0->info()->data_type())
- {
- case DataType::S8:
- _func = &gemm_interleaved_s16_12x8;
- break;
- case DataType::U8:
- _func = &gemm_interleaved_u16_12x8;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
deleted file mode 100644
index 38f82f0..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel()
- : _func(nullptr)
-{
-}
-
-void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
- const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
- reinterpret_cast<const int8_t *>(in1_ptr), ldb,
- reinterpret_cast<int32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(uint32_t);
-
- const auto in1_ptr = reinterpret_cast<const uint8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
- reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
- reinterpret_cast<uint32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- switch(input0->info()->data_type())
- {
- case DataType::S8:
- _func = &gemm_interleaved_s8;
- break;
- case DataType::U8:
- _func = &gemm_interleaved_u8;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 4, 4);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 4);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
deleted file mode 100644
index d4fcf5e..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8.2-A targets
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-
-namespace
-{
-using namespace arm_compute;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::U8, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*output);
-
- AccessWindowRectangle output_access(output, 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
-
- bool window_changed = update_window_and_padding(win,
- AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
- AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
- output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-template <typename strategy, typename To, typename Tr>
-void *align_workspace(GemmInterleaved<strategy, To, Tr> &gemm, const ThreadInfo &info, ITensor *ws)
-{
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = ws->buffer() + offset;
- size_t workspace_size = ws->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
- return workspace;
-}
-
-template <typename strategy>
-void execute_gemm(const Window &win, Iterator &in0, Iterator &in1, Iterator &out,
- const ThreadInfo &info, ITensor *ws, int M, int N, int K, bool is_transposed_0, bool is_transposed_1,
- int lda, int ldb, int ldc, float alpha, float beta)
-{
- ARM_COMPUTE_UNUSED(M);
- ARM_COMPUTE_UNUSED(N);
- ARM_COMPUTE_UNUSED(K);
- ARM_COMPUTE_UNUSED(is_transposed_0);
- ARM_COMPUTE_UNUSED(is_transposed_1);
- GemmInterleaved<strategy, typename strategy::operand_type, typename strategy::result_type> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
- void *workspace = align_workspace(gemm, info, ws);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const typename strategy::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const typename strategy::operand_type *>(in1.ptr()), ldb,
- reinterpret_cast<typename strategy::result_type *>(out.ptr()), ldc,
- alpha, beta, workspace);
- },
- in0, out);
-}
-} // namespace
-
-namespace arm_compute
-{
-void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y();
- const int ldb = _input1->info()->strides_in_bytes().y();
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(uint32_t);
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator in1(_input1, window);
- Iterator out(_output, window);
-
- switch(_input0->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::U8:
- {
- execute_gemm<gemm_u8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
- break;
- }
- case DataType::S8:
- {
- execute_gemm<gemm_s8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
- }
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
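One line of the deleted run() above is easy to misread: M is not the full output height but this thread's share of it, clipped against the tensor shape. A worked example with hypothetical numbers:

    // Output is 64 wide (N) and 100 tall; input0 is 16 wide (K); this
    // thread's sub-window covers rows [64, 96):
    //   M = std::min(size_t{100}, size_t{96}) - 64 = 32   // rows for this thread
    //   N = 64                                            // output width
    //   K = 16                                            // accumulation depth
    // The last thread's window may be rounded up past the tensor (e.g. rows
    // [96, 128)), which is why the window end is clipped against the shape.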
diff --git a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
deleted file mode 100644
index 163014b..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMVAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 1);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMVAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(sgemv_trans::result_type);
-
- const auto in1_ptr = reinterpret_cast<const sgemv_trans::operand_type *>(_input1->buffer());
-
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type> gemm(&info.cpu_info, N, K);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const sgemv_trans::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const sgemv_trans::operand_type *>(in1_ptr), ldb,
- reinterpret_cast<sgemv_trans::result_type *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
deleted file mode 100644
index e84409c..0000000
--- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
-namespace arm_compute
-{
-void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 24, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 24);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::result_type);
-
- const auto in1_ptr = reinterpret_cast<const hgemm_24x8::operand_type *>(_input1->buffer());
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const hgemm_24x8::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const hgemm_24x8::operand_type *>(in1_ptr), ldb,
- reinterpret_cast<hgemm_24x8::result_type *>(out.ptr()), ldc,
- _alpha, 1.f, workspace);
- },
- in0, out);
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
new file mode 100644
index 0000000..b3fcb33
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+// Macros to use in assembler to issue preloads. These are needed because
+// various workarounds are sometimes required to get working preload
+// behaviour.
+//
+// Code using these macros needs to clobber x20 and x21 as they might be
+// used by the workaround.
+
+// "Correct" version
+#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+
+// Lee's uarchsim hack
+//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n"
+
+// No preload at all
+//#define ASM_PREFETCH(address) ""
+#else
+
+// "Correct" versions for AArch32
+#define ASM_PREFETCH(address) "PLD " address "\n"
+#define ASM_PREFETCHW(address) "PLDW " address "\n"
+
+#endif
+
+/*
+ * Prefetch helpers: prefetch_<N>x() issues N sequential cache-line
+ * (64-byte stride) preloads starting at the given address.
+ */
+template <typename T>
+static inline void prefetch_6x(const T *pfp)
+{
+ __asm __volatile(
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ ASM_PREFETCH("[%[pfp], #256]")
+ ASM_PREFETCH("[%[pfp], #320]")
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
+}
+
+template <typename T>
+static inline void prefetch_5x(const T *pfp)
+{
+ __asm __volatile(
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ ASM_PREFETCH("[%[pfp], #256]")
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
+}
+
+template <typename T>
+static inline void prefetch_4x(const T *pfp)
+{
+ __asm __volatile(
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
+}
+
+template <typename T>
+static inline void prefetch_3x(const T *pfp)
+{
+ __asm __volatile(
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
+}
+
+template <typename T>
+static inline void prefetch_2x(const T *pfp)
+{
+ __asm __volatile(
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
+}
+
+template <typename T>
+static inline void prefetch_1x(const T *pfp)
+{
+ __asm __volatile(
+ ASM_PREFETCH("[%[pfp]]")
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
+}
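A hypothetical use of these helpers (warm_panels is illustrative, not library API): warm the leading cache lines of each operand panel before a kernel starts consuming it.

    // Each call issues six sequential 64-byte-stride preloads, i.e. it
    // covers offsets +0 .. +320 from the given pointer.
    static void warm_panels(const float *a_panel, const float *b_panel)
    {
        prefetch_6x(a_panel); // A panel
        prefetch_6x(b_panel); // B panel
    }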
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
new file mode 100644
index 0000000..dd74744
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+#ifndef NO_MULTI_THREADING
+#include <atomic>
+#include <mutex>
+
+#define USE_SEMAPHORE
+
+#ifdef USE_SEMAPHORE
+#include <condition_variable>
+#endif
+
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+enum class BufferStatus
+{
+ IDLE,
+ POPULATING,
+ BUSY
+};
+
+class Buffer
+{
+private:
+ const int _maxusers; // Maximum permissible threads.
+ void *const _storage; // Storage for buffer content.
+
+ int _numusers; // Actual number of threads (might be lower).
+
+ volatile BufferStatus _status = BufferStatus::IDLE; // Status
+ std::atomic_int _users = {}; // How many users are still using the buffer.
+ volatile int _index = 0; // Which block of data currently resides in the buffer.
+
+ std::mutex _lock = {};
+#ifdef USE_SEMAPHORE
+ std::condition_variable _cv = {};
+#endif
+
+ template <typename T>
+ void populate_buffer(T func)
+ {
+ func(_storage);
+
+ /* Now mark it as ready. */
+#ifdef USE_SEMAPHORE
+ {
+ std::unique_lock<std::mutex> ul(_lock);
+ _status = BufferStatus::BUSY;
+ _cv.notify_all();
+ }
+#else
+ _status = BufferStatus::BUSY;
+#endif
+ }
+
+public:
+ Buffer(Buffer &) = delete;
+ Buffer &operator=(Buffer &) = delete;
+
+ Buffer(void *storage, int maxusers)
+ : _maxusers(maxusers), _storage(storage), _numusers(maxusers)
+ {
+ _status = BufferStatus::IDLE;
+ }
+
+    /* Try to populate the given index.
+     * Wait if the buffer is busy with a previous index, then:
+     *
+     * If the buffer is idle, grab it and populate it.
+     * If it's already being populated by another thread or is ready, return.
+     */
+ template <typename T>
+ void try_populate(const int index, T func)
+ {
+ for(;;)
+ {
+#ifdef USE_SEMAPHORE
+ /* If it's busy with a previous index, wait on the semaphore. */
+ if((_status == BufferStatus::BUSY) && (_index != index))
+ {
+ std::unique_lock<std::mutex> ul(_lock);
+
+ if((_status == BufferStatus::BUSY) && (_index != index))
+ {
+ _cv.wait(ul);
+ }
+ }
+#endif
+ /* Return if another thread is populating it already. */
+ if((_index == index) && ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY)))
+ {
+ return;
+ }
+
+ if(_status == BufferStatus::IDLE)
+ {
+ std::lock_guard<std::mutex> guard(_lock);
+
+ /* If the buffer is still idle, we can grab it and populate it. */
+ if(_status == BufferStatus::IDLE)
+ {
+ _status = BufferStatus::POPULATING;
+ _index = index;
+ _users = _numusers;
+ break;
+ }
+ }
+ }
+
+ /* If we get here, fill in the buffer. */
+ populate_buffer(func);
+ }
+
+ template <typename T>
+ void *get(const int index, T func)
+ {
+ // Loop until we achieve something.
+ for(;;)
+ {
+ // If the index is correct and the buffer status is busy then we can
+ // just return the content. No locking is needed here as the index
+ // cannot change (and status cannot change from BUSY) until all
+ // users have finished.
+ if((_index == index) && (_status == BufferStatus::BUSY))
+ {
+ return _storage;
+ }
+#ifdef USE_SEMAPHORE
+ if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+ {
+ std::unique_lock<std::mutex> ul(_lock);
+
+ if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+ {
+ _cv.wait(ul);
+ }
+ }
+#endif
+
+ // If it's idle, we need to populate it. The IDLE->POPULATING
+ // transition requires the lock.
+ if(_status == BufferStatus::IDLE)
+ {
+ std::lock_guard<std::mutex> guard(_lock);
+
+ /* If it's still idle, grab it. Otherwise drop through and
+ * we'll do something else next time through the loop. */
+ if(_status == BufferStatus::IDLE)
+ {
+ _status = BufferStatus::POPULATING;
+ _index = index;
+ _users = _numusers;
+ break;
+ }
+ }
+ }
+
+ /* If we get here we need to populate the buffer. */
+ populate_buffer(func);
+
+ return _storage;
+ }
+
+ /* Threads call this when they have finished processing a buffer. We
+ * simply (atomically) decrement the user count, and if it's hit zero we
+ * flag the buffer as idle.
+ */
+ void release(void)
+ {
+ if(--_users == 0)
+ {
+#ifdef USE_SEMAPHORE
+ std::unique_lock<std::mutex> ul(_lock);
+ _status = BufferStatus::IDLE;
+            /* We notify all waiters as we expect one to do the populating
+             * and any others to go and process an earlier block. */
+ _cv.notify_all();
+#else
+ _status = BufferStatus::IDLE;
+#endif
+ }
+ }
+
+ /* This is called to change the number of users. */
+ void set_numusers(int numusers)
+ {
+ _numusers = std::min(numusers, _maxusers);
+ }
+};
+
+class BufferManager
+{
+private:
+ /* This has to be a vector of Buffer *, because a Buffer cannot be moved
+ * or copied due to atomic members. */
+ std::vector<Buffer *> _buffers = {};
+ const int _maxthreads;
+ void *const _storage;
+
+public:
+ BufferManager(BufferManager &) = delete;
+ BufferManager &operator=(BufferManager &) = delete;
+
+ // Say how much storage is needed.
+ static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+ {
+ return buffersize * ((maxthreads == 1) ? 1 : 3);
+ }
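+    // e.g. maxthreads = 4 with 1 MiB buffers => 3 MiB: triple buffering
+    // lets one buffer be populated while earlier ones are still in use.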
+
+ BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+ : _maxthreads(maxthreads), _storage(storage)
+ {
+ const int numbuffers = (maxthreads == 1) ? 1 : 3;
+
+ /* We don't need any Buffer objects in single thread mode. */
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ /* Use intptr_t to avoid performing arithmetic on a void * */
+ intptr_t storage_int = reinterpret_cast<intptr_t>(_storage);
+
+ for(int i = 0; i < numbuffers; i++)
+ {
+ _buffers.push_back(new Buffer(reinterpret_cast<void *>(storage_int), _maxthreads));
+ storage_int += buffersize;
+ }
+ }
+
+ ~BufferManager()
+ {
+ while(_buffers.size())
+ {
+ delete _buffers.back();
+ _buffers.pop_back();
+ }
+ }
+
+ template <typename T>
+ void *get(const int index, T func)
+ {
+ /* In single thread mode, we just directly call the populating
+ * function on the (single) buffer, otherwise forward to the
+ * relevant Buffer. */
+ if(_maxthreads == 1)
+ {
+ func(_storage);
+ return _storage;
+ }
+ else
+ {
+ return _buffers[index % _buffers.size()]->get(index, func);
+ }
+ }
+
+ template <typename T>
+ void try_populate(const int index, T func)
+ {
+ /* No need for this in single thread mode. */
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ _buffers[index % _buffers.size()]->try_populate(index, func);
+ }
+
+ void release(const int index)
+ {
+ /* No need for this in single thread mode. */
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ _buffers[index % _buffers.size()]->release();
+ }
+
+ void set_nthreads(int threads)
+ {
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ for(unsigned int i = 0; i < _buffers.size(); i++)
+ {
+ _buffers[i]->set_numusers(threads);
+ }
+ }
+};
+
+#else
+
+/* Trivial implementation if threading is disabled at compile time.
+ *
+ * Here, we only need storage for a single buffer. The 'get' method needs
+ * to call the supplied function to populate the buffer and then return it.
+ * All the other methods do nothing.
+ */
+
+class BufferManager
+{
+private:
+ void *const _storage;
+
+public:
+ BufferManager(BufferManager &) = delete;
+ BufferManager &operator=(BufferManager &) = delete;
+
+ BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+ : _storage(storage)
+ {
+ }
+
+ ~BufferManager()
+ {
+ }
+
+ // Say how much storage is needed.
+ static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+ {
+ return buffersize;
+ }
+
+ template <typename T>
+ void try_populate(const int index, T func)
+ {
+ }
+
+ void release(const int index)
+ {
+ }
+
+ template <typename T>
+ void *get(const int index, T func)
+ {
+ func(_storage);
+ return _storage;
+ }
+
+ void set_nthreads(int)
+ {
+ }
+};
+
+#endif
+
+} // namespace arm_gemm
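A sketch of the intended calling protocol (process_blocks and the fill lambda are illustrative): every participating thread walks the block indices in order; get() makes exactly one caller run the populating function while the others wait for the buffer to become ready, and each caller must release() the block once done so the buffer can be recycled.

    #include "buffer_manager.hpp"

    void process_blocks(arm_gemm::BufferManager &bm, int nblocks)
    {
        for(int index = 0; index < nblocks; index++)
        {
            // One thread runs the lambda; the rest wait until the buffer is BUSY.
            void *data = bm.get(index, [&](void *buffer)
            {
                /* pack the data for block 'index' into 'buffer' */
            });

            /* ... consume 'data' ... */

            bm.release(index); // the last user flips the buffer back to IDLE
        }
    }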
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemm_batched.hpp
new file mode 100644
index 0000000..385358f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_batched.hpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+template <typename To, typename Tr>
+class GemmBatched : public GemmCommon<To, Tr>
+{
+private:
+ UniqueGemmCommon<To, Tr> _subgemm = nullptr;
+
+public:
+ GemmBatched(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB,
+ const To alpha, const To beta, const int maxthreads, const bool pretransposed_hint)
+ {
+ /* Just create a subgemm with batches->M */
+ _subgemm = gemm<To, Tr>(ci, nbatches, N, K, 1, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint);
+ }
+
+ void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+ const To *B, const int ldb, const int B_multi_stride,
+ Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override
+ {
+        /* A's and C's batch strides become their new row strides. The new batch stride is 0, as nbatches for the subgemm is always 1. */
+ _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
+ B, ldb, B_multi_stride,
+ C, C_batch_stride, 0, C_multi_stride);
+ }
+
+ unsigned int get_window_size() const override
+ {
+ return _subgemm->get_window_size();
+ }
+
+ void set_nthreads(int nthreads) override
+ {
+ _subgemm->set_nthreads(nthreads);
+ }
+
+ void execute(unsigned int start, unsigned int end, int threadid) override
+ {
+ _subgemm->execute(start, end, threadid);
+ }
+
+ size_t get_working_size() const override
+ {
+ return _subgemm->get_working_size();
+ }
+
+ void set_working_space(void *space) override
+ {
+ _subgemm->set_working_space(space);
+ }
+
+ bool B_is_pretransposed() const override
+ {
+ return _subgemm->B_is_pretransposed();
+ }
+
+ bool B_pretranspose_required() const override
+ {
+ return _subgemm->B_pretranspose_required();
+ }
+
+ size_t get_B_pretransposed_array_size() const override
+ {
+ return _subgemm->get_B_pretransposed_array_size();
+ }
+
+ void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override
+ {
+ _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
+ }
+
+ void set_pretransposed_B_data(void *buffer) override
+ {
+ _subgemm->set_pretransposed_B_data(buffer);
+ }
+};
+
+} // namespace arm_gemm
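A worked example of the remapping above, with hypothetical sizes: a batch of 8 independent M=1 products collapses into one subgemm whose rows are the batch entries.

    // caller:   M=1, nbatches=8, row stride lda,             batch stride A_batch_stride
    // subgemm:  M=8, nbatches=1, row stride A_batch_stride,  batch stride 0
    // Row y of the subgemm's A (and C) is batch y of the original problem,
    // so the old batch strides take over the role of lda/ldc, while B is
    // passed through unchanged.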
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
new file mode 100644
index 0000000..d1180b1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// This can only be built if the target/compiler supports FP16 arguments.
+#ifdef __ARM_FP16_ARGS
+
+#include "arm_gemm.hpp"
+
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hgemm_24x8.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<__fp16, __fp16> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const __fp16 alpha, const __fp16 beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+#ifdef __aarch64__
+
+ // Only consider the native FP16 kernel if it will get built.
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ // If the compiler is configured to enable this feature always, then assume it is available at runtime too.
+ const bool use_fp16 = true;
+#else
+ // Otherwise, detect at runtime via CPUInfo.
+ const bool use_fp16 = ci.has_fp16();
+#endif
+
+ // If FP16 is supported, use it.
+ if(use_fp16)
+ {
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+#endif
+
+ // Fallback to using the blocked SGEMM kernel.
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+ // For AArch32, only support the SGEMM route for now.
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class members if necessary.
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+const int hgemm_24x8::out_width;
+const int hgemm_24x8::out_height;
+#endif
+
+} // namespace arm_gemm
+
+#endif // __ARM_FP16_ARGS
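A hypothetical call into this factory (make_fp16_gemm is illustrative; the arm_gemm::CPUInfo spelling is assumed to come via arm_gemm.hpp, and the translation unit must be built with FP16 argument support): the caller simply requests an FP16 GEMM, and whether the native hgemm_24x8 kernel or the SGEMM fallback runs depends on the build flags and the CPU described by ci.

    #include "arm_gemm.hpp"

    arm_gemm::UniqueGemmCommon<__fp16, __fp16> make_fp16_gemm(const arm_gemm::CPUInfo &ci)
    {
        return arm_gemm::gemm<__fp16, __fp16>(ci, /*M*/ 128, /*N*/ 128, /*K*/ 64,
                                              /*nbatches*/ 1, /*nmulti*/ 1,
                                              /*trA*/ false, /*trB*/ false,
                                              /*alpha*/ 1.0f, /*beta*/ 0.0f,
                                              /*maxthreads*/ 1, /*pretransposed_hint*/ false);
    }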
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
new file mode 100644
index 0000000..43df1aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_gemm.hpp"
+#include "gemm_batched.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
+#include "gemv_native_transposed.hpp"
+#include "gemv_pretransposed.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<float, float> gemm<float, float>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const float alpha, const float beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ /* Handle "batched GEMM" */
+ if(M == 1 && nbatches > 1)
+ {
+ return UniqueGemmCommon<float, float>(new GemmBatched<float, float>(ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+#ifdef __aarch64__
+ /* Cases in priority order */
+    /* GemvPretransposed: requires M=1, alpha=1 and the pretransposed hint set. nbatches must be 1, or we would have returned above, so there is no need to test it. */
+ if(M == 1 && alpha == 1.0f && pretransposed_hint)
+ {
+ return UniqueGemmCommon<float, float>(new GemvPretransposed<sgemv_pretransposed, float, float>(&ci, N, K, nmulti, trB, beta));
+ }
+
+ /* GemvNativeTransposed: requires M=1, no trA or trB, doesn't handle alpha */
+ if(M == 1 && alpha == 1.0f && !trA && !trB)
+ {
+ return UniqueGemmCommon<float, float>(new GemvNativeTransposed<sgemv_trans, float, float>(&ci, N, K, nmulti, beta));
+ }
+
+ /* Native GEMM: requires M to be a multiple of 4, K at least 4, N a
+ * multiple of 16, doesn't handle alpha and only makes sense for small
+ * sizes. */
+ if(N <= 128 && K <= 128 && ((M % 4) == 0) && (K >= 4) && ((N % 16) == 0) && alpha == 1.0f)
+ {
+ return UniqueGemmCommon<float, float>(new GemmNative<sgemm_native_16x4, float, float>(&ci, M, N, K, nbatches, nmulti, beta));
+ }
+
+ /* Blocked GEMM, handles all cases. */
+ return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_12x8, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+ return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_8x6, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class variables.
+#ifdef __aarch64__
+const int sgemm_12x8::out_width;
+const int sgemm_12x8::out_height;
+
+const int sgemm_native_16x4::out_width;
+const int sgemm_native_16x4::out_height;
+#else
+const int sgemm_8x6::out_width;
+const int sgemm_8x6::out_height;
+#endif
+
+} // namespace arm_gemm
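Tying the factory to the GemmCommon interface seen in gemm_batched.hpp above, a minimal end-to-end driver (run_sgemm is illustrative; strides are in elements, and the setup assumes a single-threaded, non-pretransposed run):

    #include "arm_gemm.hpp"

    #include <cstdint>
    #include <vector>

    void run_sgemm(const arm_gemm::CPUInfo &ci,
                   const float *A, int lda, const float *B, int ldb,
                   float *C, int ldc,
                   unsigned int M, unsigned int N, unsigned int K)
    {
        auto gemm = arm_gemm::gemm<float, float>(ci, M, N, K, /*nbatches*/ 1, /*nmulti*/ 1,
                                                 /*trA*/ false, /*trB*/ false,
                                                 /*alpha*/ 1.0f, /*beta*/ 0.0f,
                                                 /*maxthreads*/ 1, /*pretransposed_hint*/ false);

        // Zero strides are the unused batch/multi strides of a plain GEMM.
        gemm->set_arrays(A, lda, 0, 0, B, ldb, 0, C, ldc, 0, 0);

        std::vector<uint8_t> workspace(gemm->get_working_size());
        gemm->set_working_space(workspace.data());

        gemm->execute(0, gemm->get_window_size(), 0); // whole window on thread 0
    }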
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
new file mode 100644
index 0000000..7669fe0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_s16_12x8::out_width;
+const int gemm_s16_12x8::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
new file mode 100644
index 0000000..6016af2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+#include "kernels/a64_gemm_s8_12x8.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ if(ci.has_dotprod())
+ {
+ // Dot product supporting CPUs. This family has a special version for A55r1.
+ return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+
+ return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_s8_12x8::out_width;
+const int gemm_s8_12x8::out_height;
+const int gemm_s8_4x4::out_width;
+const int gemm_s8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
new file mode 100644
index 0000000..efc5171
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "buffer_manager.hpp"
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND)
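+// e.g. with ALLOC_ROUND == 64, ROUND_UP(1000) == 1024 and ROUND_UP(64) == 64,
+// so every sub-buffer sized below starts on a cache-line boundary.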
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm
+{
+template <typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo *const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmulti;
+
+ const bool _trA;
+ const bool _trB;
+
+ const Tr _alpha;
+ const Tr _beta;
+
+ const unsigned int _maxthreads;
+ const bool _pretransposed;
+
+ /* Blocking info */
+ unsigned int _k_block = 0;
+ unsigned int _x_block = 0;
+ unsigned int _Mround = 0;
+
+ /* Working space, pretransposed buffer, buffer manager */
+ const Toi *_B_transposed = nullptr;
+ BufferManager *_bm = nullptr;
+ void *_working_space = nullptr;
+
+ /* We will need to walk through the blocks of B in a few contexts, so
+ * factor that out. */
+ class blockwalker
+ {
+ private:
+ /* Size loops, etc. based on our parent's configuration */
+ const GemmInterleaved<strategy, To, Tr> &_parent;
+
+ /* K and X and multi parameters for current iteration. */
+ unsigned int _k0 = 0, _x0 = 0, _multi = 0;
+
+ unsigned int _index = 0;
+ bool _done = false;
+ bool _newkblock = true;
+ bool _newmulti = true;
+
+ public:
+ blockwalker(const GemmInterleaved<strategy, To, Tr> &parent)
+ : _parent(parent)
+ {
+ }
+
+ unsigned int xmax()
+ {
+ return std::min(_x0 + _parent._x_block, _parent._Nsize);
+ }
+
+ unsigned int kmax()
+ {
+ return std::min(_k0 + _parent._k_block, _parent._Ksize);
+ }
+
+ /* Advance to the next block, return false at the end. */
+ bool advance(void)
+ {
+ if(_done)
+ {
+ return false;
+ }
+
+ _newkblock = false;
+ _x0 += _parent._x_block;
+ if(_x0 >= _parent._Nsize)
+ {
+ _x0 = 0;
+ _k0 += _parent._k_block;
+ if(_k0 >= _parent._Ksize)
+ {
+ _k0 = 0;
+ _multi++;
+ if(_multi >= _parent._nmulti)
+ {
+ _done = true;
+ return false;
+ }
+ _newmulti = true;
+ }
+ _newkblock = true;
+ }
+ _index++;
+
+ return true;
+ }
+
+ unsigned int k0(void)
+ {
+ return _k0;
+ }
+ unsigned int x0(void)
+ {
+ return _x0;
+ }
+ unsigned int multi(void)
+ {
+ return _multi;
+ }
+ unsigned int index(void)
+ {
+ return _index;
+ }
+ bool done(void)
+ {
+ return _done;
+ }
+ bool newkblock(void)
+ {
+ return _newkblock;
+ }
+ };
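+    // Traversal order example: x0 advances fastest, then k0, then multi.
+    // With Nsize = 2 * x_block, Ksize = 2 * k_block and nmulti == 1, the
+    // walker visits (k0, x0) = (0,0) (0,x) (k,0) (k,x); newkblock() is true
+    // on the first step and whenever x0 wraps back to 0, i.e. once per K
+    // block, which is when execute_internal() re-packs the A panel.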
+
+ // A working size: One of these needed, regardless of thread count. Divided according to window.
+ size_t get_a_working_size() const
+ {
+ return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
+ }
+
+ // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
+ size_t get_b_working_size() const
+ {
+ return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
+ }
+
+ // C working size: One needed per thread.
+ size_t get_c_working_size() const
+ {
+ return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height);
+ }
+
+ // Internal execute function.
+ // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
+ template <bool pretransposed>
+ void execute_internal(unsigned int start, unsigned int end, int threadid)
+ {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+
+ strategy strat(_ci);
+
+ blockwalker current(*this);
+ blockwalker next = current;
+
+ /* Translate 'start' and 'end' into a position within the batches and rows. */
+ const unsigned int window_per_batch = _Mround / strategy::out_height;
+ unsigned int batch_0 = start / window_per_batch;
+ unsigned int batch_end = end / window_per_batch;
+
+ /* Compute the M values to operate on */
+ unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height;
+ unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height;
+
+ /* Make sure we've been set up correctly. */
+ if(pretransposed)
+ {
+ assert(_B_transposed);
+ }
+ else
+ {
+ assert(_bm);
+ }
+
+ assert(_working_space);
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+ // Private buffers. Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer.
+ // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
+ Toi *const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
+ Tri *const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+
+ // Shared buffers - these come either from BufferManager or _B_transposed.
+ const Toi *b_panel;
+
+ if(pretransposed)
+ {
+ b_panel = _B_transposed;
+ }
+
+ //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
+
+ // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+ int kern_k = 0;
+
+ for(; !current.done(); current.advance())
+ {
+ if(current.newkblock())
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height * (current.kmax() - current.k0()) * sizeof(Toi));
+#endif
+ for(unsigned int batch = batch_0; batch <= batch_end; batch++)
+ {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ if(first_m >= last_m)
+ continue;
+ if(_trA ^ strategy::A_transpose)
+ {
+ Transform<strategy::A_interleave, strategy::A_block, true>(
+ a_panel + ((batch * _Mround + first_m) * _k_block),
+ this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+ this->_lda, first_m, last_m, current.k0(), current.kmax());
+ }
+ else
+ {
+ Transform<strategy::A_interleave, strategy::A_block, false>(
+ a_panel + ((batch * _Mround + first_m) * _k_block),
+ this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+ this->_lda, first_m, last_m, current.k0(), current.kmax());
+ }
+ }
+
+ // Figure out how many "K" the kernel will actually process.
+ kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll);
+ kern_k *= strat.k_unroll;
+ }
+
+ int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width);
+
+ if(!pretransposed)
+ {
+ /* Look ahead to the next block and populate it if necessary.
+ * This avoids the populate operation becoming a bottleneck, and
+ * helps keep the threads synchronized (the first thread to get
+ * here will populate while the rest will advance).
+ *
+ * If we are running single threaded, bm->try_populate() will do
+ * nothing.
+ */
+ if(next.advance())
+ {
+ _bm->try_populate(next.index(), [&](void *buffer)
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi));
+#endif
+
+ Toi *b_panel = reinterpret_cast<Toi *>(buffer);
+ if(_trB ^ strategy::B_transpose)
+ {
+ Transform<strategy::B_interleave, strategy::B_block, true>(
+ b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
+ next.x0(), next.xmax(), next.k0(), next.kmax());
+ }
+ else
+ {
+ Transform<strategy::B_interleave, strategy::B_block, false>(
+ b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
+ next.x0(), next.xmax(), next.k0(), next.kmax());
+ }
+ });
+ }
+ /* Get the buffer for this iteration from the BufferManager. */
+ b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv)
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi));
+#endif
+
+ Toi *b_panel = reinterpret_cast<Toi *>(bpv);
+ if(_trB ^ strategy::B_transpose)
+ {
+ Transform<strategy::B_interleave, strategy::B_block, true>(
+ b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
+ current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+ else
+ {
+ Transform<strategy::B_interleave, strategy::B_block, false>(
+ b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
+ current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+
+ }));
+ }
+
+ /* Do the actual work. */
+ for(unsigned int batch = batch_0; batch <= batch_end; batch++)
+ {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+
+ if(first_m >= last_m)
+ continue;
+
+ for(unsigned int y = first_m; y < last_m; y += strategy::out_height)
+ {
+ unsigned int ymax = std::min(_Msize, y + strategy::out_height);
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k));
+#endif
+
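+                        // One pass of the inner kernel: a single A block
+                        // against 'bblocks' B blocks, producing an
+                        // out_height x (bblocks * out_width) panel of
+                        // results in c_panel.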
+ strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+
+ a_ptr += (strategy::out_height * kern_k);
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)));
+#endif
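+                        // beta is applied only when merging the first
+                        // k-block; later k-blocks accumulate onto C, so
+                        // the scaling factor becomes 1.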
+ MergeResults<strategy::out_width, strategy::out_height>(
+ this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
+ c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
+ _alpha, (current.k0() == 0 ? _beta : static_cast<Tr>(1)));
+ }
+ }
+ }
+
+ if(pretransposed)
+ {
+                b_panel += (bblocks * strategy::out_width * kern_k);
+ }
+ else
+ {
+ _bm->release(current.index());
+ }
+ }
+ }
+
+public:
+ GemmInterleaved(GemmInterleaved &) = delete;
+ GemmInterleaved &operator=(GemmInterleaved &) = delete;
+
+ /* Constructor */
+ GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB,
+ const Tr alpha, const Tr beta, const int maxthreads, const bool pretransposed)
+ : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed)
+ {
+ const unsigned int L1_size = ci->get_L1_cache_size();
+ const unsigned int L2_size = ci->get_L2_cache_size();
+
+ assert(maxthreads > 0);
+
+ // Work out blocking parameters
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width, strategy::out_height)));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll;
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll;
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ int num_k_blocks = iceildiv(K, _k_block);
+
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(K, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll);
+ _k_block *= strategy::k_unroll;
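+        // Worked example (illustrative numbers): a 32kB L1, 4-byte
+        // operands and max(out_width, out_height) = 12 give an initial
+        // _k_block of 16384 / 48 = 341. For K = 1000 that means
+        // num_k_blocks = 3, so _k_block is rebalanced to
+        // iceildiv(1000, 3) = 334: three near-equal passes instead of two
+        // full blocks plus a ragged remainder.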
+
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width;
+ _x_block = std::max(_x_block, 1U) * strategy::out_width;
+
+ // And tune to the presented problem size.
+ int num_x_blocks = iceildiv(N, _x_block);
+ _x_block = iceildiv(N, num_x_blocks);
+
+ _x_block = iceildiv(_x_block, strategy::out_width);
+ _x_block *= strategy::out_width;
+
+ // Work out the rounded size of M - needed for some buffers.
+ _Mround = iceildiv(M, strategy::out_height);
+ _Mround *= strategy::out_height;
+ }
+
+ // Interface implementation - Compulsory functions
+
+ // Window size: Only the last thread should do a ragged block, so dole
+ // out work in units of out_height. Factor batches into the window, but
+ // not multi for now (as this would cause problems with the buffer
+ // manager).
+
+ unsigned int get_window_size() const override
+ {
+ // _Mround is a multiple of out_height by definition.
+ return (_Mround / strategy::out_height) * _nbatches;
+ }
+
+    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
+ void set_nthreads(int nthreads) override
+ {
+ if(_bm)
+ {
+ _bm->set_nthreads(nthreads);
+ }
+ }
+
+ // Execute
+ void execute(unsigned int start, unsigned int end, int threadid) override
+ {
+ if(_pretransposed)
+ {
+ execute_internal<true>(start, end, threadid);
+ }
+ else
+ {
+ execute_internal<false>(start, end, threadid);
+ }
+ }
+
+ // Interface implementation - working space
+ size_t get_working_size() const override
+ {
+ // In all cases, we need one A buffer plus a C buffer per thread.
+ size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
+
+ // For pretransposed case, there is no working space needed for B.
+ // Otherwise, we need a BufferManager.
+ if(!_pretransposed)
+ {
+ size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+ }
+
+ size += 64; // Add on a cache line extra for alignment.
+
+ return size;
+ }
+
+ void set_working_space(void *working_space) override
+ {
+ // Make sure everything ends up cache line aligned
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+ intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
+
+ size_t diff = 0;
+
+ if(working_space_int & 0x3F)
+ {
+ diff = 0x40 - (working_space_int & 0x3F);
+ }
+
+ working_space_bytes += diff;
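+        // e.g. working_space = 0x1008: (0x1008 & 0x3F) = 0x08, so diff =
+        // 0x38 and the adjusted pointer is 0x1040, the next 64-byte
+        // boundary. The spare cache line added in get_working_size()
+        // covers this shift.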
+
+ if(_pretransposed)
+ {
+ // Pretransposed case: just set internal pointer to parameter value.
+ _working_space = reinterpret_cast<void *>(working_space_bytes);
+ }
+ else
+ {
+ // Otherwise, use the first part of the working space for the buffer manager.
+ // It's legal to call this again so don't leak a buffer manager if it already existed.
+ delete _bm;
+
+ _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
+
+ working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+
+ _working_space = reinterpret_cast<void *>(working_space_bytes);
+ }
+ }
+
+ // Interface implementation - pretransposed
+ bool B_is_pretransposed() const override
+ {
+ return _pretransposed;
+ }
+
+ bool B_pretranspose_required() const override
+ {
+ return _pretransposed && (_B_transposed == nullptr);
+ }
+
+ size_t get_B_pretransposed_array_size() const override
+ {
+ size_t total = 0;
+ blockwalker current(*this);
+
+ do
+ {
+ /* Figure out the size of each block. */
+ size_t x_size = (current.xmax() - current.x0());
+ size_t k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width);
+ x_size *= strategy::out_width;
+
+ k_size = iceildiv(k_size, strategy::k_unroll);
+ k_size *= strategy::k_unroll;
+
+ total += x_size * k_size * sizeof(Toi);
+ }
+ while(current.advance());
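+        // e.g. a block with xmax - x0 = 10 and out_width = 8 pads x_size
+        // up to 16, so with kmax - k0 = 100 and k_unroll = 1 it
+        // contributes 16 * 100 * sizeof(Toi) bytes (illustrative numbers).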
+
+ return total;
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override
+ {
+ blockwalker current(*this);
+ Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+ _B_transposed = buffer;
+
+ do
+ {
+ /* Figure out the size of each block. */
+ size_t x_size = (current.xmax() - current.x0());
+ size_t k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width);
+ x_size *= strategy::out_width;
+
+ k_size = iceildiv(k_size, strategy::k_unroll);
+ k_size *= strategy::k_unroll;
+
+ if(_trB ^ strategy::B_transpose)
+ {
+ Transform<strategy::B_interleave, strategy::B_block, true>(
+ buffer, B + (current.multi() * B_multi_stride), ldb,
+ current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+ else
+ {
+ Transform<strategy::B_interleave, strategy::B_block, false>(
+ buffer, B + (current.multi() * B_multi_stride), ldb,
+ current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+
+ buffer += (x_size * k_size);
+ }
+ while(current.advance());
+ }
+
+ void set_pretransposed_B_data(void *in_buffer) override
+ {
+ _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+ }
+
+ ~GemmInterleaved() override
+ {
+ delete _bm;
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
new file mode 100644
index 0000000..075ab82
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for native GEMM with no transposition.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
+
+template <typename strategy, typename To, typename Tr>
+class GemmNative : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmultis;
+
+ Tr _beta;
+
+ const CPUInfo *const _ci;
+
+ unsigned int k_block = 0;
+ unsigned int n_block = 0;
+
+public:
+ GemmNative(GemmNative &) = delete;
+ GemmNative &operator=(GemmNative &) = delete;
+
+ GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta)
+ : _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci)
+ {
+        /* For now don't do any blocking. */
+ k_block = K;
+ n_block = N;
+ }
+
+    // Window is the number of out_height blocks, times batches, times multis.
+ unsigned int get_window_size() const override
+ {
+ return iceildiv(_Msize, strategy::out_height) * _nbatches * _nmultis;
+ }
+
+ // Actually execute the GEMM.
+ void execute(unsigned int start, unsigned int end, int) override
+ {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+ strategy strat(_ci);
+ const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height);
+ const unsigned int window_per_multi = window_per_batch * _nbatches;
+
+ const unsigned int first_multi = start / window_per_multi;
+ const unsigned int last_multi = end / window_per_multi;
+
+ const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch;
+ const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch;
+
+ const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height;
+ const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height;
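+        // Illustrative decomposition: _Msize = 16, out_height = 8 and
+        // _nbatches = 2 give window_per_batch = 2 and window_per_multi = 4;
+        // start = 5 then yields first_multi = 1, first_batch = 0 and
+        // first_row = ((5 - 4) % 2) * 8 = 8.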
+
+ static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+ static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+ for(unsigned int multi = first_multi; multi <= last_multi; multi++)
+ {
+ const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0;
+ const unsigned int batch_max = (multi == last_multi) ? last_batch : _nbatches - 1;
+
+ for(unsigned int batch = batch_0; batch <= batch_max; batch++)
+ {
+ const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0;
+ const unsigned int m_end = ((multi == last_multi) && (batch == last_batch)) ? last_row : _Msize;
+
+ for(unsigned int y0 = m_start; y0 < m_end; y0 += strategy::out_height)
+ {
+ const unsigned int ymax = std::min(y0 + strategy::out_height, m_end);
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax - y0) * _Nsize * _Ksize);
+#endif
+
+ strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
+ this->_Bptr + (multi * this->_B_multi_stride), this->_ldb,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc,
+ _beta, (ymax - y0), _Nsize, _Ksize);
+ }
+ }
+ }
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
new file mode 100644
index 0000000..8f1f377
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u16_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, uint32_t alpha, uint32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_u16_12x8::out_width;
+const int gemm_u16_12x8::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
new file mode 100644
index 0000000..12e5aa6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u8_12x8.hpp"
+#include "kernels/a64_gemm_u8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const uint32_t alpha, const uint32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ if(ci.has_dotprod())
+ {
+        // CPUs supporting the dot product instruction. This family has a special version for A55r1.
+ return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+
+    // Non-dot-product code.
+ return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_u8_12x8::out_width;
+const int gemm_u8_12x8::out_height;
+
+const int gemm_u8_4x4::out_width;
+const int gemm_u8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
new file mode 100644
index 0000000..63bb58a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for a "native" (no-transform) GEMV with a
+// transposed matrix.
+//
+// As a native operation the source data is used in-place, so the internal
+// and external operand/result types must match.
+template <typename strategy, typename To, typename Tr>
+class GemvNativeTransposed : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+ const unsigned int _nmultis;
+
+ const Tr _beta;
+
+ const CPUInfo *const _ci;
+
+ unsigned int m_block = 0;
+ unsigned int n_block = 0;
+
+public:
+ GemvNativeTransposed(GemvNativeTransposed &) = delete;
+ GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete;
+
+ GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta)
+ : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci)
+ {
+        /* For now don't do any blocking. */
+ m_block = K;
+ n_block = N;
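+        // With m_block = K and n_block = N, the loops in execute() below
+        // collapse to a single kernel call per multi for each assigned
+        // window strip.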
+ }
+
+    // Window is the number of out_width blocks times the number of multis.
+ unsigned int get_window_size() const override
+ {
+ return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+ }
+
+ // Actually execute the GEMV.
+ void execute(unsigned int start, unsigned int end, int) override
+ {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+
+ strategy strat(_ci);
+
+ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+ const unsigned int multi_0 = start / window_per_multi;
+ const unsigned int multi_end = end / window_per_multi;
+
+ const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width;
+ const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+
+ static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
+ static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
+
+ for(unsigned int multi = multi_0; multi <= multi_end; multi++)
+ {
+ const unsigned int n_start = (multi == multi_0) ? n_0 : 0;
+ const unsigned int n_end = (multi == multi_end) ? n_max : _Nsize;
+
+ if(n_end <= n_start)
+ continue;
+
+ for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+ {
+ unsigned int mmax = std::min(m0 + m_block, _Ksize);
+ for(unsigned int n0 = n_start; n0 < n_end; n0 += n_block)
+ {
+ unsigned int nmax = std::min(n0 + n_block, n_end);
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n0));
+#endif
+ strat.kernel(this->_Bptr + (multi * this->_B_multi_stride) + (m0 * this->_ldb) + n0,
+ this->_Aptr + (multi * this->_A_multi_stride) + m0,
+ this->_Cptr + (multi * this->_C_multi_stride) + n0,
+ _beta, this->_ldb, (mmax - m0), (nmax - n0));
+ }
+ }
+ }
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
new file mode 100644
index 0000000..79f1359
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for GEMV with pretransposition.
+// Batches are not supported, as a batched GEMV makes no sense (it can be
+// converted to a GEMM).
+
+template <typename strategy, typename To, typename Tr>
+class GemvPretransposed : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+ const unsigned int _nmultis;
+
+ const bool _trB;
+
+ const Tr _beta;
+
+ const CPUInfo *const _ci;
+ const unsigned int _buffer_per_multi;
+
+ unsigned int m_block = 0;
+ unsigned int n_block = 0;
+
+ const Toi *_A_pretransposed = nullptr;
+
+public:
+ GemvPretransposed(GemvPretransposed &) = delete;
+ GemvPretransposed &operator=(GemvPretransposed &) = delete;
+
+ GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta)
+ : _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci), _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave)
+ {
+        /* For now don't do any blocking. */
+ m_block = K;
+ n_block = N;
+ }
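+
+    // e.g. N = 10, K = 100 and A_interleave = 8 give a per-multi buffer
+    // of 100 * iceildiv(10, 8) * 8 = 1600 elements: N is rounded up to a
+    // whole number of interleaved strips (illustrative numbers).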
+
+    // Window is the number of out_width blocks, times the number of multis.
+ unsigned int get_window_size() const override
+ {
+ return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+ }
+
+ // Actually execute the GEMV.
+ void execute(unsigned int start, unsigned int end, int) override
+ {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+
+ strategy strat(_ci);
+
+ /* Break the window values down into multis of interest... */
+ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+ const unsigned int multi_0 = start / window_per_multi;
+ const unsigned int multi_end = end / window_per_multi;
+
+ /* ... and figure out where we start and end in the first and last multi. */
+ const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width;
+ const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+
+ static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
+
+ for(unsigned int multi = multi_0; multi <= multi_end; multi++)
+ {
+ const unsigned int n_start = (multi == multi_0) ? n_0 : 0;
+ const unsigned int n_end = (multi == multi_end) ? n_max : _Nsize;
+
+ if(n_end <= n_start)
+ continue;
+
+ for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+ {
+ unsigned int mmax = std::min(m0 + m_block, _Ksize);
+ for(unsigned int n = n_start; n < n_end; n += n_block)
+ {
+ unsigned int nmax = std::min(n + n_block, n_end);
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n));
+#endif
+ /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
+ strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave),
+ (_Ksize * strategy::A_interleave),
+ this->_Aptr + (multi * this->_A_multi_stride) + m0,
+ this->_Cptr + (multi * this->_C_multi_stride) + n,
+ _beta, (mmax - m0), (nmax - n));
+ }
+ }
+ }
+ }
+
+ /* Pretransposed interface implementation */
+ bool B_is_pretransposed() const override
+ {
+ return true;
+ }
+
+ bool B_pretranspose_required() const override
+ {
+ /* Transpose is required if _A_pretransposed is still nullptr */
+ return (_A_pretransposed == nullptr);
+ }
+
+ size_t get_B_pretransposed_array_size() const override
+ {
+ return _buffer_per_multi * _nmultis * sizeof(To);
+ }
+
+ void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override
+ {
+ Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
+
+ for(unsigned int multi = 0; multi < _nmultis; multi++)
+ {
+            /* The sense is reversed here as we are dealing with B rather
+             * than A: if strategy::A_transpose is false and _trB is false,
+             * we still transpose. */
+ if(_trB ^ strategy::A_transpose)
+ {
+ Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+ }
+ else
+ {
+ Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+ }
+ }
+
+ _A_pretransposed = A_buffer;
+ }
+
+ void set_pretransposed_B_data(void *buffer) override
+ {
+ _A_pretransposed = reinterpret_cast<Toi *>(buffer);
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
new file mode 100644
index 0000000..de11dc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a32_sgemm_8x6(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a53(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a55r1(const float *, const float *, float *, int, int, int);
+
+// 8x6 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_8x6
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 6;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 8;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 8;
+ static const int out_height = 6;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a32_sgemm_8x6;
+
+ sgemm_8x6(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model())
+ {
+ case CPUModel::A53:
+ kernel = a32_sgemm_8x6_a53;
+ break;
+
+ case CPUModel::A55r1:
+ kernel = a32_sgemm_8x6_a55r1;
+ break;
+
+ default:
+ kernel = a32_sgemm_8x6;
+ break;
+ }
+ }
+};
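+
+// A strategy class is consumed by the templated implementations such as
+// GemmInterleaved. A minimal usage sketch (illustrative only, mirroring
+// the gemm_*.cpp dispatch files; set_arrays is assumed from the
+// GemmCommon interface):
+//
+//   GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, nbatches,
+//                                                 nmulti, trA, trB, alpha,
+//                                                 beta, maxthreads, false);
+//   gemm.set_arrays(A, lda, ..., B, ldb, ..., C, ldc, ...);
+//   gemm.set_working_space(workspace);
+//   gemm.execute(0, gemm.get_window_size(), 0);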
+
+} // namespace arm_gemm
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
new file mode 100644
index 0000000..428498f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int tails = (K & 3);
+ if(tails == 0)
+ {
+ tails = 4;
+ }
+ int k = ((K + 3) / 4) - 1;
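+            // e.g. K = 10 gives tails = 2 and k = 2: two full 4-deep
+            // unrolled iterations, then a 2-block tail. K = 8 gives
+            // tails = 4 and k = 1, so the detached final iteration handles
+            // the whole last unroll.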
+
+ __asm __volatile(
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]\n"
+ "vmov.i32 q6, #0\n"
+ "ldr r0, [%[a_ptr], #0x10]\n"
+ "vmov.i32 q7, #0\n"
+ "ldr r1, [%[a_ptr], #0x14]\n"
+ "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32 q9, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32 q10, #0\n" ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32 q11, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "vmov.i32 q12, #0\n"
+                "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32 q14, #0\n" ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "vmov.i32 q15, #0\n"
+ "cmp %[k], #0\n"
+ "beq 6f\n"
+
+ "1:\n"
+ // Unroll 0
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmov d2, r0, r1\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "ldr r0, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "ldr r1, [%[b_ptr], #0x1C]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x100]")
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x20]\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "ldr r0, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "ldr r1, [%[b_ptr], #0x2C]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "ldr r0, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "ldr r1, [%[a_ptr], #0x2C]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+
+ // Unroll 1
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmov d1, r0, r1\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "ldr r0, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "ldr r1, [%[b_ptr], #0x3C]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "ldr r0, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "ldr r1, [%[b_ptr], #0x4C]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "ldr r0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "ldr r1, [%[a_ptr], #0x44]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+
+ // Unroll 2
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmov d0, r0, r1\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "ldr r0, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "ldr r1, [%[b_ptr], #0x5C]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "ldr r0, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "ldr r1, [%[b_ptr], #0x6C]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "ldr r0, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "ldr r1, [%[a_ptr], #0x5C]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+
+ // Unroll 3
+ "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmov d3, r0, r1\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "ldr r0, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "ldr r1, [%[b_ptr], #0x7C]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+
+ "vldr d0, [%[a_ptr], #0x00]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x00]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "ldr r0, [%[b_ptr], #0x08]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "ldr r1, [%[b_ptr], #0x0C]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "subs %[k], %[k], #1\n"
+
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "ldr r0, [%[a_ptr], #0x10]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "ldr r1, [%[a_ptr], #0x14]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, and must be 1-4 inclusive. Bail out to the alternative
+                // tail immediately if it's 1.
+ "6:\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration - for now adapt the generic
+ // tails rather than reimplementing for A53.
+
+ // Unroll 0
+ "vmov d2, r0, r1\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x10\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmov d2, r0, r1\n"
+ "add %[b_ptr], %[b_ptr], #0x10\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
new file mode 100644
index 0000000..4cfb72a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ /* Work out starting values for "k" and "tails" in the inner loop. */
+ int tails_initial = (K & 3);
+ if(tails_initial == 0)
+ {
+ tails_initial = 4;
+ }
+
+ int k_initial = ((K + 3) / 4) - 1;
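+    /* These starting values are taken once out here because the inline
+     * asm below updates its [k] and [tails] operands in place; fresh
+     * copies are made per block inside the loop. */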
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ int tails = tails_initial;
+ int k = k_initial;
+
+ a_ptr = a_ptr0;
+
+ __asm __volatile(
+ "vldr d0, [%[a_ptr]]\n"
+ "vmov.i32 q4, #0\n"
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmov.i32 q5, #0\n"
+ "vldr d4, [%[b_ptr]]\n"
+ "vmov.i32 q6, #0\n"
+ "vldr d5, [%[b_ptr], #0x08]\n"
+ "vmov.i32 q7, #0\n"
+ "vldr d2, [%[a_ptr], #0x10]\n"
+ "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32 q9, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32 q10, #0\n" ASM_PREFETCH("[%[b_ptr], #0x80]") "vmov.i32 q11, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32 q12, #0\n" ASM_PREFETCH("[%[b_ptr], #0xC0]") "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32 q14, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]") "vmov.i32 q15, #0\n" ASM_PREFETCH("[%[a_ptr], #0x100]") "cmp %[k], #0\n" ASM_PREFETCH("[%[b_ptr], #0x140]") "beq 6f\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+
+ "1:\n"
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmla.f32 q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vldr d4, [%[b_ptr], #0x20]\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vldr d5, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+
+ "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "vmla.f32 q8, q2, d1[0]\n"
+
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vldr d5, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+
+ "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x180]")
+ "vmla.f32 q8, q2, d0[0]\n"
+
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vldr d5, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vldr d7, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vldr d0, [%[a_ptr], #0x00]\n"
+ "vmla.f32 q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "vmla.f32 q8, q2, d3[0]\n"
+
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vldr d4, [%[b_ptr], #0x00]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vldr d5, [%[b_ptr], #0x08]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vldr d2, [%[a_ptr], #0x10]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, and must be 1-4 inclusive. Bail out to the alternative
+                // tail immediately if it's 1.
+ "6:\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration
+
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vldr d4, [%[b_ptr], #0x20]\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vldr d5, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+
+ "vmla.f32 q9, q2, d1[1]\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vldr d5, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vldr d5, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vldr d7, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x20\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x30\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x48\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
new file mode 100644
index 0000000..d7d0484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
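+//
+// For orientation, a plain-C++ reference of the arithmetic one 8x6 output
+// block performs, given the panel layouts above. This is an illustrative
+// sketch only; the assembly kernel below never calls it.
+static inline void a32_sgemm_8x6_block_reference(const float *a_block, const float *b_block, float *c_block, int K)
+{
+ for(int i = 0; i < 6; i++) // 6 rows sourced from the A block
+ {
+ for(int j = 0; j < 8; j++) // 8 columns sourced from the B block
+ {
+ float acc = 0.0f;
+ for(int k = 0; k < K; k++)
+ {
+ acc += a_block[(k * 6) + i] * b_block[(k * 8) + j];
+ }
+ c_block[(i * 8) + j] = acc; // each 8x6 C output block is row major
+ }
+ }
+}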
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int tails = (K & 3);
+ if(tails == 0)
+ {
+ tails = 4;
+ }
+ int k = ((K + 3) / 4) - 1;
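+ // Worked example: K=7 gives tails=3 and k=1 (one pass through the
+ // 4-way-unrolled main loop, then a 3-step tail); K=8 gives tails=4
+ // and k=1, so the detached final iteration handles one whole block
+ // of four.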
+
+ __asm __volatile(
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+ "vmov.i32 q6, #0\n" ASM_PREFETCH("[%[a_ptr], #48]") "vmov.i32 q7, #0\n" ASM_PREFETCH("[%[b_ptr], #48]") "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[a_ptr], #112]") "vmov.i32 q9, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #112]")
+ "vmov.i32 q10, #0\n"
+ "vmov.i32 q11, #0\n"
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #176]") "vmov.i32 q14, #0\n" ASM_PREFETCH("[%[b_ptr], #176]")
+ "vmov.i32 q15, #0\n"
+
+ "cmp %[k], #0\n"
+ "beq 6f\n"
+
+ "1:\n"
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q5, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #208]")
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #240]")
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #208]")
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ // Branch here if we never execute the main loop.
+ "6:\n"
+
+ // "Tails" shows how many multiply blocks are needed at the
+ // end, must be 1-4 inclusive. Bail out to alternative tail
+ // immediately if it's 1.
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
new file mode 100644
index 0000000..387f899
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class gemm_s16_12x8
+{
+public:
+ typedef int16_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a64_gemm_s16_asimd_12x8;
+
+ gemm_s16_12x8(const CPUInfo *ci)
+ {
+ }
+};
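+
+// Usage note: for a given K these constants fix the panel geometry - one
+// interleaved A block holds A_interleave * K int16 values, one B block
+// holds B_interleave * K int16 values, and each C block written by the
+// kernel is out_height * out_width int32 values.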
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
new file mode 100644
index 0000000..b217dcf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int16_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int16_t *a_ptr0 = a_ptr;
+ const int16_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ const bool odd_k = K & 0x1;
+ int k = (K + 1) / 2 - 1;
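+ // Worked example: K=5 gives odd_k=true and k=2 (two passes of the
+ // 2-way-unrolled main loop, then the odd tail); K=4 gives
+ // odd_k=false and k=1, with the even tail handling the final pair
+ // of k steps.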
+
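+ // These variables are pinned to named NEON registers so that the
+ // inline asm below can refer to them as %[aa], %[b0] and so on,
+ // while the compiler still tracks them as operands.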
+ register int16x8_t aa asm("v0");
+ register int16x8_t ab asm("v1");
+ register int16x8_t b0 asm("v2");
+ register int16x8_t b1 asm("v3");
+ register int16x8_t b2 asm("v4");
+
+ __asm __volatile(
+ "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+ "movi v5.4s, #0\n"
+ "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+ "movi v6.4s, #0\n"
+ "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
+ "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v8.4s, #0\n"
+ "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+ "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v10.4s, #0\n"
+ "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
+ "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+ "movi v12.4s, #0\n"
+ "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v24.4s, #0\n"
+ "add %x[a_ptr], %x[a_ptr], #0x10\n"
+ "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v26.4s, #0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+
+ "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+ "1:\n" // Main loop
+ // First unroll
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ // Second unroll
+ "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
+ "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+ "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "add %x[a_ptr], %x[a_ptr], #0x20\n"
+ "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "subs %x[k], %x[k], #0x1\n"
+ "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
+ "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "add %x[b_ptr], %x[b_ptr], #0x30\n"
+ "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "bne 1b\n"
+
+ "2:\n" // Even tail
+ "cbnz %x[odd_k], 3f\n"
+
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "add %[a_ptr], %[a_ptr], #0x10\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "add %[b_ptr], %[b_ptr], #0x18\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+ "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "b 4f\n" // Complete write out
+
+ "3:\n" // Odd tail
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+
+ "4:\n" // End of function
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "str q27, [%x[c_ptr], #0x140]\n"
+ "str q12, [%x[c_ptr], #0x150]\n"
+ "str q20, [%x[c_ptr], #0x160]\n"
+ "str q28, [%x[c_ptr], #0x170]\n"
+ "add %x[c_ptr], %x[c_ptr], #0x180\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+ [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+ : [odd_k] "r"(odd_k)
+ : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
new file mode 100644
index 0000000..08f90e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class gemm_s8_12x8
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 4;
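+ /* k_unroll is 4 because each sdot consumes four int8 values per
+ * lane, matching the A_block and B_block grouping above. */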
+
+ kern_type kernel = a64_gemm_s8_12x8;
+
+ gemm_s8_12x8(const CPUInfo *ci)
+ {
+ if(ci->get_cpu_model() == CPUModel::A55r1)
+ {
+ kernel = a64_gemm_s8_12x8_a55r1;
+ }
+ }
+};
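+
+// The constructor above is the runtime dispatch point: the generic
+// dot-product kernel is the default, and the Cortex-A55r1-tuned variant
+// is selected when CPUInfo reports that core.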
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
new file mode 100644
index 0000000..ef2f291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+ const int W = K / 4;
+
+ // Fix up for odd lengths - set a flag if the block count W is odd,
+ // but make sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int k_iters = ((W + 1) / 2) - 1;
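+ // Worked example: K=24 gives W=6, oddk=0 and k_iters=2 - two passes
+ // of the 2-way-unrolled main loop, with the even-K continuation
+ // handling the final pair of W steps.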
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = k_iters;
+
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_SDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "b.ne 1b\n"
+
+ // Branch here if W is 1 or 2 (i.e. we did zero iterations of the main loop). Do the right thing for odd/even at the end.
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "b 3f\n"
+
+ // Odd K continuation
+ "2:\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem sdot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000..c76f99d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
+#define _DECLARE_SDOT \
+ ".altmacro\n" \
+ ".macro sdot opd:req, opn:req, opm:req\n" \
+ "local vd, vn, vm, h, l\n" \
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \
+ ".set vd,\\reg\n" \
+ ".endif\n" \
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \
+ ".set vn,\\reg\n" \
+ ".endif\n" \
+ ".irp idx,0,1,2,3\n" \
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \
+ ".set vm,\\reg\n" \
+ ".set h,\\idx / 2\n" \
+ ".set l,\\idx %% 2\n" \
+ ".endif\n" \
+ ".endr\n" \
+ ".endr\n" \
+ ".ifndef vd\n" \
+ ".error \"Bad operand \\opd\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vn\n" \
+ ".error \"Bad operand \\opn\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vm\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef h\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef l\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \
+ ".endm\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
new file mode 100644
index 0000000..258ef5e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+ const int W = K / 4;
+ // Fix up for odd lengths - set a flag if the block count W is odd,
+ // but make sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W + 1) / 2) - 1;
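+ // Worked example: K=20 gives W=5, oddk=1 and init_value_k=2 - two
+ // passes of the 2-way-unrolled main loop (four W steps), then the
+ // odd-K detached final iteration handles the fifth.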
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_SDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when W is 1 or 2 (i.e. zero iterations of the main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem sdot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
new file mode 100644
index 0000000..2ec28f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Load the actual kernel
+void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class gemm_s8_4x4
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 4;
+ static const int A_block = 16;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 4;
+ static const int B_block = 16;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 4;
+ static const int out_height = 4;
+ static const int k_unroll = 16;
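+
+    /* Note: A_block, B_block and k_unroll are all 16 because the kernel
+     * consumes K in chunks of 16 int8 values (one full 128-bit vector per
+     * operand); generic.cpp correspondingly divides K by 16 before
+     * computing its iteration counts. */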
+
+ kern_type kernel = a64_gemm_s8_4x4;
+
+ gemm_s8_4x4(const CPUInfo *ci)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
new file mode 100644
index 0000000..243b94e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ K /= 16;
+ int oddk = (K & 1);
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+
+ int k = ((K + 1) / 2) - 1;
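+            // Worked example of the bookkeeping: for K=96 int8 elements,
+            // K /= 16 above gives 6 blocks, so oddk=0 and k=((6+1)/2)-1=2:
+            // two double-unrolled loop iterations run, then the detached
+            // even-K tail handles the last two blocks. For K=80 there are
+            // 5 blocks, oddk=1 and k=2, and the odd-K tail (label 2)
+            // consumes the final block.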
+
+ register int8x16_t b0 asm("v4");
+ register int8x16_t b1 asm("v5");
+ register int8x16_t b2 asm("v6");
+ register int8x16_t b3 asm("v7");
+ register int8x16_t b0a asm("v8");
+ register int8x16_t b1a asm("v9");
+ register int8x16_t b2a asm("v10");
+ register int8x16_t b3a asm("v11");
+
+ __asm __volatile(
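+                // Initialize result registers, load initial operands, prime prefetches.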
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+                "movi v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi v30.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+                // Loop structure optimized for Cortex-A57 (revisions after r0).
+
+ // Unavoidably, the multiply will "dribble" if
+ // dual issued with an add.
+
+ // Minimize the effect of this by making sure
+ // there are 2 adds to run under the dribbled
+ // multiply.
+
+ // Pipeline in blocks of 8 multiplies - combine
+ // this iteration's multiplies with adds from
+ // the previous iteration.
+
+ // So the first block doesn't have any adds to
+ // do - but because all the adds are at the
+ // start of the block it's only the first couple
+ // of multiplies that need to be pulled out.
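+
+                // A rough scalar sketch of what one accumulator computes per
+                // 16-wide block of K (v12 feeding v16 shown; the other
+                // smull/smlal2/sadalp chains are analogous):
+                //   int16_t p[8];
+                //   for (int n = 0; n < 8; n++)       // smull + smlal2
+                //       p[n] = a[n] * b[n] + a[n + 8] * b[n + 8];
+                //   for (int m = 0; m < 4; m++)       // sadalp
+                //       acc32[m] += p[2 * m] + p[2 * m + 1];
+                // The four int32 partial sums per accumulator are collapsed
+                // into one result by the addp instructions in the tail.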
+
+ // Start of unroll 0 (first iteration)
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Unroll 0 continuation (branch target)
+ "1:\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr], #0]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+ "ldr q0, [%[a_ptr], #128]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+
+ // Start of unroll 0 for next iteration.
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "bne 1b\n"
+
+ // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "3:\n"
+
+ // Final additions
+ "sadalp v28.4s, v12.8h\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "sadalp v31.4s, v15.8h\n"
+
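+                // Each of the sixteen accumulators v16..v31 held four int32
+                // partial sums (one per sadalp). v16..v27 have already been
+                // collapsed, interleaved with the tail multiplies above; the
+                // addp pairs below do the same for v28..v31. "addp" sums
+                // adjacent lanes across its two sources, so two phases turn
+                // four partial-sum vectors into one vector of four finished
+                // outputs.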
+ // Horizontal reduction, phase 1
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ // Horizontal reduction, phase 2
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
+
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+ [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a), [b3a] "+w"(b3a),
+ [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
new file mode 100644
index 0000000..3975732
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+// 12x8 GEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class gemm_u16_12x8
+{
+public:
+ typedef uint16_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a64_gemm_u16_asimd_12x8;
+
+ gemm_u16_12x8(const CPUInfo *ci)
+ {
+ }
+};
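+
+// A minimal sketch of how a wrapper might drive this strategy (the
+// GemmInterleaved template elsewhere in arm_gemm works along these lines;
+// the names below are illustrative, not actual API):
+//
+//   template <typename strategy>
+//   void run_gemm(const CPUInfo *ci, ...) {
+//       strategy strat(ci);
+//       // Pack A per A_interleave/A_block, B per B_interleave/B_block,
+//       // then tile the output in out_width x out_height blocks:
+//       strat.kernel(a_packed, b_packed, c_panel, ablocks, bblocks, K);
+//   }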
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
new file mode 100644
index 0000000..7903878
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint16_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint16_t *a_ptr0 = a_ptr;
+ const uint16_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ const bool odd_k = K & 0x1;
+ int k = (K + 1) / 2 - 1;
+
+ register uint16x8_t aa asm("v0");
+ register uint16x8_t ab asm("v1");
+ register uint16x8_t b0 asm("v2");
+ register uint16x8_t b1 asm("v3");
+ register uint16x8_t b2 asm("v4");
+
+ __asm __volatile(
+ "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+ "movi v5.4s, #0\n"
+ "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+ "movi v6.4s, #0\n"
+ "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
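+                // Operand vectors are built from a 64-bit "ldr d" plus a
+                // 64-bit "ldr x" into x20, merged with "ins". Splitting each
+                // 128-bit load in two lets the halves issue alongside the
+                // multiplies, which tends to schedule better on in-order
+                // cores than a single "ldr q".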
+ "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v8.4s, #0\n"
+ "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+ "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v10.4s, #0\n"
+ "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
+ "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+ "movi v12.4s, #0\n"
+ "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v24.4s, #0\n"
+ "add %x[a_ptr], %x[a_ptr], #0x10\n"
+ "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v26.4s, #0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+
+ "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+ "1:\n" // Main loop
+ // First unroll
+                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ // Second unroll
+ "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
+ "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+ "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "add %x[a_ptr], %x[a_ptr], #0x20\n"
+ "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "subs %x[k], %x[k], #0x1\n"
+ "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
+ "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "add %x[b_ptr], %x[b_ptr], #0x30\n"
+ "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "bne 1b\n"
+
+ "2:\n" // Even tail
+ "cbnz %x[odd_k], 3f\n"
+
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "add %x[a_ptr], %x[a_ptr], #0x10\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+ "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "b 4f\n" // Complete write out
+
+ "3:\n" // Odd tail
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+
+ "4:\n" // End of function
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "str q27, [%x[c_ptr], #0x140]\n"
+ "str q12, [%x[c_ptr], #0x150]\n"
+ "str q20, [%x[c_ptr], #0x160]\n"
+ "str q28, [%x[c_ptr], #0x170]\n"
+ "add %x[c_ptr], %x[c_ptr], #0x180\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+ [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+ : [odd_k] "r"(odd_k)
+ : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
new file mode 100644
index 0000000..26255b1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Load the actual kernel
+void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+class gemm_u8_12x8
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 4;
+
+ kern_type kernel = a64_gemm_u8_12x8;
+
+ gemm_u8_12x8(const CPUInfo *ci)
+ {
+ if(ci->get_cpu_model() == CPUModel::A55r1)
+ {
+ kernel = a64_gemm_u8_12x8_a55r1;
+ }
+ }
+};
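+
+// This constructor is the runtime selection point described in the strategy
+// class comments: the generic 12x8 kernel is the default, and the
+// A55r1-tuned variant is substituted when the detected CPU model matches.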
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
new file mode 100644
index 0000000..f8fafbd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K / 4;
+
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int k_iters = ((W + 1) / 2) - 1;
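+    // Example: K=32 gives W=8 dot-product blocks, so oddk=0 and
+    // k_iters=((8+1)/2)-1=3: three double-unrolled iterations, then the
+    // even-K tail consumes the last two blocks. K=36 gives W=9, oddk=1
+    // and k_iters=4, with the odd-K tail handling the final block.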
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = k_iters;
+
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_UDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+                "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is offset by these two instructions, which must
+                // always be executed.
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
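+                // Software pipelining: this first udot and b2 load are
+                // hoisted out of the loop (and repeated at the bottom of the
+                // loop body), so every pass through label 1 starts with
+                // these operations already issued.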
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "b.ne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "b 3f\n"
+
+ // Odd K continuation
+ "2:\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem udot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000..5ee273b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
+#define _DECLARE_UDOT \
+ ".altmacro\n" \
+ ".macro udot opd:req, opn:req, opm:req\n" \
+ "local vd, vn, vm, h, l\n" \
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \
+ ".set vd,\\reg\n" \
+ ".endif\n" \
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \
+ ".set vn,\\reg\n" \
+ ".endif\n" \
+ ".irp idx,0,1,2,3\n" \
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \
+ ".set vm,\\reg\n" \
+ ".set h,\\idx / 2\n" \
+ ".set l,\\idx %% 2\n" \
+ ".endif\n" \
+ ".endr\n" \
+ ".endr\n" \
+ ".ifndef vd\n" \
+ ".error \"Bad operand \\opd\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vn\n" \
+ ".error \"Bad operand \\opn\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vm\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef h\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef l\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \
+ ".endm\n"
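+
+// How the manual encoding works: 0x6f80e000 is the base opcode of the
+// by-element form "udot vd.4s, vn.16b, vm.4b[0]". Rd sits in bits [4:0],
+// Rn in bits [9:5] and Rm in bits [20:16]; the lane index is split across
+// the H (bit 11) and L (bit 21) fields, hence h = idx/2 and l = idx%2
+// above. For example, "udot v8.4s, v2.16b, v0.4b[1]" assembles to
+// 0x6f80e000 | 8 | (2 << 5) | (1 << 21) = 0x6fa0e048.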
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
new file mode 100644
index 0000000..d026dc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K / 4;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W + 1) / 2) - 1;
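+    // Same double-unroll bookkeeping as the A55r1 variant: W counts 4-wide
+    // dot-product blocks (the strategy header's k_unroll=4 means the packed
+    // panels are expected to present K in multiples of 4), init_value_k
+    // counts paired iterations and oddk selects the detached tail.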
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register uint8x16_t a0 asm("v0");
+ register uint8x16_t a1 asm("v1");
+ register uint8x16_t b0 asm("v2");
+ register uint8x16_t b1 asm("v3");
+ register uint8x16_t b2 asm("v4");
+ register uint8x16_t a0a asm("v5");
+ register uint8x16_t a1a asm("v6");
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_UDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+                "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi v18.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem udot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
new file mode 100644
index 0000000..5aa5291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Kernel definition
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
+
+class gemm_u8_4x4
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 4;
+ static const int A_block = 16;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 4;
+ static const int B_block = 16;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 4;
+ static const int out_height = 4;
+ static const int k_unroll = 16;
+
+ kern_type kernel = nullptr;
+
+ gemm_u8_4x4(const CPUInfo *ci)
+ {
+ kernel = a64_gemm_u8_4x4;
+ }
+};
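+
+// Note (illustrative): with A_interleave = 4 and A_block = 16, the packed A
+// panel holds groups of 4 rows with 16 consecutive K-values per row, which
+// matches the k_unroll = 16 consumed by each pass of the generic kernel.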
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
new file mode 100644
index 0000000..0a881ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ K /= 16;
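+ // Each pass of the kernel consumes 16 K-values (k_unroll = 16 in the
+ // strategy class), so convert K from elements to loop iterations.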
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+
+ int k = K - 1;
+
+ register uint8x16_t b0 asm("v4");
+ register uint8x16_t b1 asm("v5");
+ register uint8x16_t b2 asm("v6");
+ register uint8x16_t b3 asm("v7");
+
+ __asm __volatile(
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]") "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
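+ // Widening accumulate scheme: umull/umull2 form u8 x u8 -> u16 products,
+ // and uadalp then pairwise-adds adjacent u16 products into the u32
+ // accumulators, so the 8-bit products never overflow their 16-bit
+ // intermediates.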
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 2f\n"
+
+ "1:\n"
+ "uadalp v16.4s, v12.8h\n"
+ "umull2 v12.8h, v0.16b, %[b0].16b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull2 v13.8h, v0.16b, %[b1].16b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull2 v14.8h, v0.16b, %[b2].16b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr]]\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull2 v12.8h, v1.16b, %[b0].16b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull2 v13.8h, v1.16b, %[b1].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "uadalp v22.4s, v14.8h\n"
+ "umull2 v14.8h, v1.16b, %[b2].16b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "uadalp v22.4s, v14.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull2 v12.8h, v2.16b, %[b0].16b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull2 v13.8h, v2.16b, %[b1].16b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull2 v14.8h, v2.16b, %[b2].16b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull2 v13.8h, v3.16b, %[b1].16b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull2 v14.8h, v3.16b, %[b2].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+ "bne 1b\n"
+
+ // Branch target
+ "2:\n"
+ "uadalp v16.4s, v12.8h\n"
+ "umull2 v12.8h, v0.16b, %[b0].16b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull2 v13.8h, v0.16b, %[b1].16b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull2 v14.8h, v0.16b, %[b2].16b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull2 v12.8h, v1.16b, %[b0].16b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull2 v13.8h, v1.16b, %[b1].16b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull2 v14.8h, v1.16b, %[b2].16b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull2 v12.8h, v2.16b, %[b0].16b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull2 v13.8h, v2.16b, %[b1].16b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull2 v14.8h, v2.16b, %[b2].16b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull2 v12.8h, v3.16b, %[b0].16b\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull2 v13.8h, v3.16b, %[b1].16b\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull2 v14.8h, v3.16b, %[b2].16b\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "uadalp v31.4s, v15.8h\n"
+
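+ // Reduce: two rounds of pairwise addp fold the sixteen 4-lane
+ // accumulators down to four registers holding the 4x4 int32 result tile.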
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+
+ "str q16, [%[c_ptr]]\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
+
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+ [k] "+r"(k)
+ :
+ : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
new file mode 100644
index 0000000..5fc0a7b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+// 24x8 HGEMM "strategy" class. Describes the kernel properties.
+//
+// The generic "gemm_opt" function will instantiate one of these (allowing
+// the constructor to pick a kernel implementation).
+class hgemm_24x8
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+ static const int A_block = 1;
+ static const int A_interleave = 8;
+ static const bool A_transpose = false;
+
+ static const int B_block = 1;
+ static const int B_interleave = 24;
+ static const bool B_transpose = true;
+
+ static const int out_width = 24;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ // Default to the generic kernel
+ kern_type kernel = a64_hgemm_asimd_24x8;
+
+ hgemm_24x8(const CPUInfo *ci)
+ {
+ if(ci->get_cpu_model() == CPUModel::A55r1)
+ {
+ kernel = a64_hgemm_asimd_24x8_a55r1;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
new file mode 100644
index 0000000..2186117
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k_iters = ((K + 1) / 2) - 1;
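+ // For example (illustrative): K = 5 gives oddk = 1 and k_iters = 2 - two
+ // passes of the double-unrolled main loop, with the odd-K tail covering
+ // the final step.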
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ int k = k_iters;
+ a_ptr = a_ptr0;
+
+ // As the A55 requires 64-bit loads anyway, just use 64 bits of the
+ // "A" operands to save on "ins" instructions. Since the A55 is
+ // in-order, two sets of "A" operands and one set of "B" operands
+ // are sufficient.
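+ // Illustrative equivalence (not executed): the split-load pattern used
+ // below, e.g.
+ //     ldr %d[b2], [%[b_ptr], #32]
+ //     ldr x20,    [%[b_ptr], #40]
+ //     ins %[b2].d[1], x20
+ // fetches the same 128 bits as a single "ldr %q[b2], [%[b_ptr], #32]",
+ // but as 64-bit pieces that the in-order core can schedule around the
+ // fmla instructions.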
+ register float16x8_t a0 asm("v0");
+ register float16x8_t a1 asm("v1");
+ register float16x8_t a0a asm("v2");
+ register float16x8_t a1a asm("v3");
+ register float16x8_t b0 asm("v4");
+ register float16x8_t b1 asm("v5");
+ register float16x8_t b2 asm("v6");
+
+ __asm __volatile(
+ // Enable FP16 extensions
+ ".arch armv8.2-a+fp16\n"
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.8h, #0x0\n"
+ "ldr %d[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %d[a1], [%[a_ptr], #8]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.8h, #0x0\n"
+ "movi v13.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v14.8h, #0x0\n"
+ "movi v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v16.8h, #0x0\n"
+ "movi v17.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v18.8h, #0x0\n"
+ "movi v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #16]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #24]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ // Unroll 1
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "ldr %d[a0], [%[a_ptr], #32]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
+ "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
+ "ldr %d[a1], [%[a_ptr], #40]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+
+ "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "bne 1b\n"
+
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #16]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #24]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+ "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
+ "ldr %d[a1], [%[a_ptr], #40]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+ "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+ "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
+ "b 3f\n"
+
+ "2:\n"
+
+ // Odd tail
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+ // Common tail
+ // The A55 won't dual-issue these stores with anything else, so it is
+ // simplest to do them all in this common code.
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "5:\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "=w"(a0), [a0a] "=w"(a0a), [a1] "=w"(a1), [a1a] "=w"(a1a),
+ [b0] "=w"(b0), [b1] "=w"(b1), [b2] "=w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
new file mode 100644
index 0000000..65a5d43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
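+ // In contrast to the A55r1 variant (which splits loads into 64-bit
+ // pieces and keeps a single set of "B" registers), this kernel uses
+ // full 128-bit loads and double-buffers the "B" operands
+ // (b0a/b1a/b2a).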
+ register float16x8_t a0 asm("v0");
+ register float16x8_t a0a asm("v1");
+ register float16x8_t b0 asm("v2");
+ register float16x8_t b1 asm("v3");
+ register float16x8_t b2 asm("v4");
+ register float16x8_t b0a asm("v5");
+ register float16x8_t b1a asm("v6");
+ register float16x8_t b2a asm("v7");
+
+ __asm __volatile(
+ ".arch armv8.2-a+fp16\n"
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.8h, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v12.8h, #0x0\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+ "movi v13.8h, #0x0\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+ "movi v14.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v16.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v17.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]") "movi v18.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[b_ptr], #288]")
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" ASM_PREFETCH("[%[b_ptr], #352]")
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+
+ "bne 1b\n"
+ "4:\n"
+
+ // Jump to odd tail if necessary.
+ "cbnz %w[oddk], 2f\n"
+
+ // Even tail.
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+ "b 3f\n"
+
+ // Odd tail
+ "2:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a0a] "+w"(a0a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k),
+ [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
new file mode 100644
index 0000000..91a9e8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure passed to the constructor.
+class sgemm_12x8
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a64_sgemm_asimd_12x8;
+
+ sgemm_12x8(const CPUInfo *ci)
+ {
+ // Select specific kernel if available
+ switch(ci->get_cpu_model())
+ {
+ case CPUModel::A53:
+ kernel = a64_sgemm_asimd_12x8_a53;
+ break;
+
+ case CPUModel::A55r0:
+ kernel = a64_sgemm_asimd_12x8_a55;
+ break;
+
+ case CPUModel::A55r1:
+ kernel = a64_sgemm_asimd_12x8_a55r1;
+ break;
+
+ default:
+ /* Generic kernel is initialized by default. */
+ break;
+ }
+ }
+};
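+
+// Minimal usage sketch (assuming a populated CPUInfo "ci" is available):
+//     sgemm_12x8 strat(&ci);
+//     strat.kernel(Apanel, Bpanel, Cpanel, ablocks, bblocks, K);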
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
new file mode 100644
index 0000000..618ebc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
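+ // Scheduling note: the explicit "nop"s and split 64-bit ldr/ins pairs in
+ // this kernel are issue-slot padding tuned for the A53's in-order
+ // dual-issue pipeline; they are not functionally required.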
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+ "nop\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+ "nop\n"
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+
+ "nop\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration. (even K)
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
new file mode 100644
index 0000000..4ca25eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
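+ // (e.g. K=5 gives oddk=1, k=2: two double-unrolled loop passes cover
+ // four K steps and the detached odd tail handles the fifth; K=6 gives
+ // oddk=0, k=2, with the detached even tail handling two steps.)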
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "subs %w[k], %w[k], #1\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %w[oddk], 2f\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ // Detached final iteration. (even K)
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
new file mode 100644
index 0000000..89fe6ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k_iters = ((K + 1) / 2) - 1;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = k_iters;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "b.ne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Odd K continuation
+ "2:\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
new file mode 100644
index 0000000..42e870e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
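+//
+// For reference, each block pair computes the equivalent of this scalar
+// sketch (illustrative only; it ignores the unrolling and prefetching the
+// assembly below actually performs):
+//
+//     for (int k = 0; k < K; k++)
+//         for (int row = 0; row < 8; row++)
+//             for (int col = 0; col < 12; col++)
+//                 Cpanel[row * 12 + col] += Apanel[k * 8 + row] * Bpanel[k * 12 + col];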
+
+namespace arm_gemm
+{
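+// The row_jump and block_jump parameters let callers with non-contiguous B
+// panels skip extra bytes as the kernel walks B (row_jump is applied once
+// per K step, block_jump once in the even-K tail); the plain 12x8 entry
+// point at the bottom of this file passes zero for both.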
+void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump = 0, long int block_jump = 0)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
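+ // (Unlike the in-order-core variants earlier in this directory, this
+ // generic version can use full 128-bit "ldr %q" loads throughout.)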
+ "1:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk), [row_jump] "r"(row_jump), [block_jump] "r"(block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
new file mode 100644
index 0000000..eceacc9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_native_16x4(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+// 16x4 native SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_native_16x4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static const int out_width = 16;
+ static const int out_height = 4;
+ static const int k_unroll = 1;
+
+ // Default to the generic kernel
+ kern_type kernel = a64_sgemm_native_16x4;
+
+ sgemm_native_16x4(const CPUInfo *ci)
+ {
+ }
+};
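+
+// A typical caller (sketch only - the surrounding dispatch machinery is
+// assumed rather than shown) instantiates the strategy for the current CPU
+// and invokes the kernel through the function pointer:
+//
+//     sgemm_native_16x4 strat(ci);
+//     strat.kernel(A, lda, B, ldb, C, ldc, beta, M, N, K);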
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
new file mode 100644
index 0000000..8d4a38c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+namespace arm_gemm
+{
+void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
+{
+ const int oddk = ((K % 8) >= 4) ? 1 : 0;
+ const int beta0 = (beta == 0.0f) ? 1 : 0;
+ const int oddones = (K % 4);
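+ // K is consumed eight steps per main-loop pass, with a detached final
+ // pass of eight or (when oddk is set) four steps, plus "oddones" (0-3)
+ // single leftover steps. e.g. K=13: one loop pass (8) + odd final
+ // pass (4) + one leftover = 13.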
+
+ /* For now, very naive with no blocking */
+ for(int y = 0; y < M; y += 4)
+ {
+ for(int x0 = 0; x0 < N; x0 += 16)
+ {
+ const float *a_ptr0 = A + (y * lda);
+ const float *a_ptr1 = a_ptr0 + lda;
+ const float *a_ptr2 = a_ptr1 + lda;
+ const float *a_ptr3 = a_ptr2 + lda;
+
+ const float *b_ptr = B + x0;
+
+ float *c_ptr0 = C + (y * ldc) + x0;
+ float *c_ptr1 = c_ptr0 + ldc;
+ float *c_ptr2 = c_ptr1 + ldc;
+ float *c_ptr3 = c_ptr2 + ldc;
+
+ int loops = ((K + 4) / 8) - 1;
+ int odds = oddones;
+
+ size_t ldbb = ldb * sizeof(float);
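+ // ldb is given in elements; the inline assembly advances b_ptr one
+ // whole row of B at a time, so precompute the row stride in bytes.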
+
+ __asm __volatile(
+ "a0 .req v0\n"
+ "a1 .req v1\n"
+ "a2 .req v2\n"
+ "a3 .req v3\n"
+ "a0a .req v4\n"
+ "a1a .req v5\n"
+ "a2a .req v6\n"
+ "a3a .req v7\n"
+ "bb0 .req v8\n"
+ "bb1 .req v9\n"
+ "bb2 .req v10\n"
+ "bb3 .req v11\n"
+ "b0a .req v12\n"
+ "b1a .req v13\n"
+ "b2a .req v14\n"
+ "b3a .req v15\n"
+
+ "a0q .req q0\n"
+ "a1q .req q1\n"
+ "a2q .req q2\n"
+ "a3q .req q3\n"
+ "a0aq .req q4\n"
+ "a1aq .req q5\n"
+ "a2aq .req q6\n"
+ "a3aq .req q7\n"
+ "b0q .req q8\n"
+ "b1q .req q9\n"
+ "b2q .req q10\n"
+ "b3q .req q11\n"
+ "b0aq .req q12\n"
+ "b1aq .req q13\n"
+ "b2aq .req q14\n"
+ "b3aq .req q15\n"
+
+ "movi v16.4s, #0x0\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+ "movi v25.4s, #0x0\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+ "cbz %w[beta0], 5f\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip if no complete loops.
+ "cbz %w[loops], 4f\n"
+ "b 1f\n"
+
+ // If beta is non-zero, need to load and multiply by beta
+ "5:\n"
+ "ld1r {v4.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #16]\n"
+ "ldr q18, [%[c_ptr0], #32]\n"
+ "ldr q19, [%[c_ptr0], #48]\n"
+
+ "ldr q20, [%[c_ptr1]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "ldr q21, [%[c_ptr1], #16]\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "ldr q22, [%[c_ptr1], #32]\n"
+ "fmul v18.4s, v18.4s, v4.4s\n"
+ "ldr q23, [%[c_ptr1], #48]\n"
+ "fmul v19.4s, v19.4s, v4.4s\n"
+
+ "ldr q24, [%[c_ptr2]]\n"
+ "fmul v20.4s, v20.4s, v4.4s\n"
+ "ldr q25, [%[c_ptr2], #16]\n"
+ "fmul v21.4s, v21.4s, v4.4s\n"
+ "ldr q26, [%[c_ptr2], #32]\n"
+ "fmul v22.4s, v22.4s, v4.4s\n"
+ "ldr q27, [%[c_ptr2], #48]\n"
+ "fmul v23.4s, v23.4s, v4.4s\n"
+
+ "ldr q28, [%[c_ptr3]]\n"
+ "fmul v24.4s, v24.4s, v4.4s\n"
+ "ldr q29, [%[c_ptr3], #16]\n"
+ "fmul v25.4s, v25.4s, v4.4s\n"
+ "ldr q30, [%[c_ptr3], #32]\n"
+ "fmul v26.4s, v26.4s, v4.4s\n"
+ "ldr q31, [%[c_ptr3], #48]\n"
+ "fmul v27.4s, v27.4s, v4.4s\n"
+
+ "fmul v28.4s, v28.4s, v4.4s\n"
+ "fmul v29.4s, v29.4s, v4.4s\n"
+ "fmul v30.4s, v30.4s, v4.4s\n"
+ "fmul v31.4s, v31.4s, v4.4s\n"
+
+ "cbz %w[loops], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "add %[a_ptr1], %[a_ptr1], #32\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "add %[a_ptr2], %[a_ptr2], #32\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "add %[a_ptr3], %[a_ptr3], #32\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "bne 1b\n"
+
+ // Skipped to here when there are no main loop iterations to run
+ "4:\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "cbnz %w[oddk], 2f\n" // Deal with odd K before we load a0a
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "add %[a_ptr1], %[a_ptr1], #32\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "add %[a_ptr2], %[a_ptr2], #32\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "add %[a_ptr3], %[a_ptr3], #32\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "cbnz %w[odds], 6f\n"
+
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // Odd K case: finish unroll 0, then just do unrolls 1-3.
+ "2:\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #16\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "add %[a_ptr1], %[a_ptr1], #16\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "add %[a_ptr2], %[a_ptr2], #16\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "add %[a_ptr3], %[a_ptr3], #16\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "cbnz %w[odds], 7f\n"
+
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
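+ // The "odds" tail below mops up any leftover K values one at a time:
+ // each remaining element of the four A rows is broadcast with ld1r and
+ // multiplied against freshly loaded B vectors. It is entered at "6:"
+ // from the even-unroll path above, or at "7:" from the odd-K path.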
+ // "Odd ones" - lead in from even
+ "6:\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+ "b 8f\n"
+
+ // "Odd ones" - lead in from odd
+ "7:\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+
+ // "Odd ones" - loop
+ "8:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "bne 8b\n"
+
+ // "Odd ones" - detached final iteration
+ "9:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "str q27, [%[c_ptr2], #48]\n"
+
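+ // Common exit: every path above has already stored rows 0-2, so only
+ // the row 3 accumulators (v28-v31) remain to be written out.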
+ "3:\n"
+ "str q28, [%[c_ptr3]]\n"
+ "str q29, [%[c_ptr3], #16]\n"
+ "str q30, [%[c_ptr3], #32]\n"
+ "str q31, [%[c_ptr3], #48]\n"
+
+ : [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3),
+ [b_ptr] "+r"(b_ptr), [loops] "+r"(loops), [odds] "+r"(odds)
+ : [ldb] "r"(ldbb), [oddk] "r"(oddk), [beta0] "r"(beta0), [betaptr] "r"(&beta),
+ [c_ptr0] "r"(c_ptr0), [c_ptr1] "r"(c_ptr1), [c_ptr2] "r"(c_ptr2), [c_ptr3] "r"(c_ptr3)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+ "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
new file mode 100644
index 0000000..c89514f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemv_pretransposed(const float *, int, const float *, float *, float, int, int);
+
+// Pretransposed SGEMV strategy class.
+class sgemv_pretransposed
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, float, int, int);
+
+ /* Describes the data layout for matrix (A) input */
+
+ /* Note that often GEMV is expressed as a GEMM with M=1, i.e. A is the
+ * (row) vector and B is the matrix, but the standard GEMV arrangement
+ * is matrix A times (column) vector X. "A_transpose" is expressed in
+ * terms of this standard arrangement, so if the A matrix is in fact the
+ * B matrix from a GEMM call, the sense of the transpose needs to be
+ * reversed. */
+ static const int A_interleave = 32;
+ static const int A_block = 1;
+ static const bool A_transpose = false;
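+
+ /* Illustrative sketch (inferred from the kernel, not part of the
+ * interface): with A_interleave = 32 and A_block = 1, element (i, k) of
+ * the standard GEMV matrix lands at
+ * A_pre[(i / 32) * lda + (k * 32) + (i % 32)]
+ * where lda is the per-block stride, so each step of the dot product
+ * reads 32 consecutive floats. */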
+
+ /* Kernel blocking parameters */
+ static const int out_width = 32;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a64_sgemv_pretransposed;
+
+ sgemv_pretransposed(const CPUInfo *ci)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
new file mode 100644
index 0000000..2907598
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N)
+{
+ const bool beta0 = (beta == 0.0f);
+ const bool beta1 = (beta == 1.0f);
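+ // beta == 0 means the accumulators can start at zero without reading Y;
+ // beta == 1 means Y is read but the multiply by beta can be skipped.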
+
+ for(int x = 0; x < N; x += 32)
+ {
+ float *y_ptr = Y + x;
+
+ // How many elements are we processing in this loop?
+ int l = std::min(N - x, 32);
+
+ register float32x4_t r0 asm("v24");
+ register float32x4_t r1 asm("v25");
+ register float32x4_t r2 asm("v26");
+ register float32x4_t r3 asm("v27");
+ register float32x4_t r4 asm("v28");
+ register float32x4_t r5 asm("v29");
+ register float32x4_t r6 asm("v30");
+ register float32x4_t r7 asm("v31");
+
+ register float32x4_t x0 asm("v0");
+ register float32x4_t x0a asm("v1");
+
+ const float *x_ptr = X;
+ const float *a_ptr = A + ((x / 32) * lda);
+
+ if(beta0)
+ {
+ r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = vdupq_n_f32(0.0f);
+ }
+ else
+ {
+ if(l == 32)
+ {
+ // Fastest path - load all 8 vectors
+ r0 = vld1q_f32(y_ptr);
+ r1 = vld1q_f32(y_ptr + 4);
+ r2 = vld1q_f32(y_ptr + 8);
+ r3 = vld1q_f32(y_ptr + 12);
+ r4 = vld1q_f32(y_ptr + 16);
+ r5 = vld1q_f32(y_ptr + 20);
+ r6 = vld1q_f32(y_ptr + 24);
+ r7 = vld1q_f32(y_ptr + 28);
+ }
+ else
+ {
+ // Slow case - leftovers. Note that we don't care about
+ // out-of-range vectors and lanes as we will throw them away at
+ // the end.
+ int vecs = l / 4; // How many leftover vectors?
+ int oddbits = l % 4; // And how many odd single values?
+
+ if(oddbits)
+ {
+ // Load the outstanding odd values into a vector first
+ float32x4_t oddvec = vdupq_n_f32(0.0f); // Not logically required, but avoids a maybe-uninitialized warning from the compiler.
+ float *oddbase = y_ptr + l - oddbits;
+
+ switch(oddbits)
+ {
+ case 3:
+ oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
+ // fall through
+ case 2:
+ oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
+ // fall through
+ case 1:
+ oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
+ break;
+
+ default:
+ UNREACHABLE("Impossible case in switch.");
+ }
+
+ // Now load the whole vectors, putting the oddments in when we run out.
+ do
+ {
+ if(vecs == 0)
+ {
+ r0 = oddvec;
+ break;
+ }
+
+ r0 = vld1q_f32(y_ptr);
+ if(--vecs == 0)
+ {
+ r1 = oddvec;
+ break;
+ }
+
+ r1 = vld1q_f32(y_ptr + 4);
+ if(--vecs == 0)
+ {
+ r2 = oddvec;
+ break;
+ }
+
+ r2 = vld1q_f32(y_ptr + 8);
+ if(--vecs == 0)
+ {
+ r3 = oddvec;
+ break;
+ }
+
+ r3 = vld1q_f32(y_ptr + 12);
+ if(--vecs == 0)
+ {
+ r4 = oddvec;
+ break;
+ }
+
+ r4 = vld1q_f32(y_ptr + 16);
+ if(--vecs == 0)
+ {
+ r5 = oddvec;
+ break;
+ }
+
+ r5 = vld1q_f32(y_ptr + 20);
+ if(--vecs == 0)
+ {
+ r6 = oddvec;
+ break;
+ }
+
+ r6 = vld1q_f32(y_ptr + 24);
+ r7 = oddvec;
+ }
+ while(0);
+ }
+ else
+ {
+ // Slightly less slow path - just load the whole vectors
+ do
+ {
+ // It can't be the case that oddbits==0 AND vecs==0 or we wouldn't be here.
+ if(vecs == 0)
+ {
+ UNREACHABLE("Impossible lack of work to do");
+ }
+
+ r0 = vld1q_f32(y_ptr);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r1 = vld1q_f32(y_ptr + 4);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r2 = vld1q_f32(y_ptr + 8);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r3 = vld1q_f32(y_ptr + 12);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r4 = vld1q_f32(y_ptr + 16);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r5 = vld1q_f32(y_ptr + 20);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r6 = vld1q_f32(y_ptr + 24);
+ }
+ while(0);
+ }
+ }
+
+ if(!beta1)
+ {
+ const float32x4_t vb = vdupq_n_f32(beta);
+
+ r0 = vmulq_f32(r0, vb);
+ r1 = vmulq_f32(r1, vb);
+ r2 = vmulq_f32(r2, vb);
+ r3 = vmulq_f32(r3, vb);
+ r4 = vmulq_f32(r4, vb);
+ r5 = vmulq_f32(r5, vb);
+ r6 = vmulq_f32(r6, vb);
+ r7 = vmulq_f32(r7, vb);
+ }
+ }
+
+ if(M >= 8)
+ {
+ int k = (M / 8) - 1;
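+ // The loop below consumes 8 elements of X per iteration (the lanes of
+ // x0 and x0a); one iteration is peeled off as the detached final
+ // iteration, hence the "- 1" above.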
+ x0 = vld1q_f32(x_ptr);
+
+ __asm __volatile(
+ "ldr q2, [%[a_ptr], #0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr q10, [%[a_ptr], #128]\n"
+ "ldr q11, [%[a_ptr], #144]\n"
+ "ldr q12, [%[a_ptr], #160]\n"
+ "ldr q13, [%[a_ptr], #176]\n"
+ "ldr q14, [%[a_ptr], #192]\n"
+ "ldr q15, [%[a_ptr], #208]\n"
+ "ldr q16, [%[a_ptr], #224]\n"
+ "ldr q17, [%[a_ptr], #240]\n"
+ "ldr q18, [%[a_ptr], #256]\n"
+ "ldr q19, [%[a_ptr], #272]\n"
+ "ldr q20, [%[a_ptr], #288]\n"
+ "ldr q21, [%[a_ptr], #304]\n"
+ "ldr q22, [%[a_ptr], #320]\n"
+ "ldr q23, [%[a_ptr], #336]\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ ASM_PREFETCH("[%[a_ptr], #448]")
+ ASM_PREFETCH("[%[a_ptr], #512]")
+ ASM_PREFETCH("[%[a_ptr], #576]")
+ ASM_PREFETCH("[%[a_ptr], #640]")
+ ASM_PREFETCH("[%[a_ptr], #704]")
+ ASM_PREFETCH("[%[a_ptr], #768]")
+ ASM_PREFETCH("[%[a_ptr], #832]")
+ ASM_PREFETCH("[%[a_ptr], #896]")
+ ASM_PREFETCH("[%[a_ptr], #960]")
+ ASM_PREFETCH("[%[a_ptr], #1024]")
+ ASM_PREFETCH("[%[a_ptr], #1088]")
+ ASM_PREFETCH("[%[a_ptr], #1152]")
+ ASM_PREFETCH("[%[a_ptr], #1216]")
+ ASM_PREFETCH("[%[a_ptr], #1280]")
+ ASM_PREFETCH("[%[a_ptr], #1344]")
+ ASM_PREFETCH("[%[a_ptr], #1408]")
+ ASM_PREFETCH("[%[a_ptr], #1472]")
+ ASM_PREFETCH("[%[a_ptr], #1536]")
+ ASM_PREFETCH("[%[a_ptr], #1600]")
+ ASM_PREFETCH("[%[a_ptr], #1664]")
+ ASM_PREFETCH("[%[a_ptr], #1728]")
+ ASM_PREFETCH("[%[a_ptr], #1792]")
+ ASM_PREFETCH("[%[a_ptr], #1856]")
+ ASM_PREFETCH("[%[a_ptr], #1920]")
+ ASM_PREFETCH("[%[a_ptr], #1984]")
+ "add %[a_ptr], %[a_ptr], #352\n"
+
+ "cbz %w[k], 2f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr %q[x0a], [%[x_ptr], #16]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #0]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #16]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #32]\n"
+ "add %[x_ptr], %[x_ptr], #32\n" ASM_PREFETCH("[%[a_ptr], #1664]")
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #48]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #64]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #80]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #96]\n" ASM_PREFETCH("[%[a_ptr], #1728]")
+
+ // Unroll 1
+ "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
+ "ldr q10, [%[a_ptr], #112]\n"
+ "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
+ "ldr q11, [%[a_ptr], #128]\n"
+ "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
+ "ldr q12, [%[a_ptr], #144]\n"
+ "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
+ "ldr q13, [%[a_ptr], #160]\n" ASM_PREFETCH("[%[a_ptr], #1792]")
+ "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
+ "ldr q14, [%[a_ptr], #176]\n"
+ "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
+ "ldr q15, [%[a_ptr], #192]\n"
+ "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
+ "ldr q16, [%[a_ptr], #208]\n"
+ "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
+ "ldr q17, [%[a_ptr], #224]\n" ASM_PREFETCH("[%[a_ptr], #1856]")
+
+ // Unroll 2
+ "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
+ "ldr q18, [%[a_ptr], #240]\n"
+ "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
+ "ldr q19, [%[a_ptr], #256]\n"
+ "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
+ "ldr q20, [%[a_ptr], #272]\n"
+ "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
+ "ldr q21, [%[a_ptr], #288]\n" ASM_PREFETCH("[%[a_ptr], #1920]")
+ "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
+ "ldr q22, [%[a_ptr], #304]\n"
+ "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
+ "ldr q23, [%[a_ptr], #320]\n"
+ "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
+ "ldr q2, [%[a_ptr], #336]\n"
+ "ldr q3, [%[a_ptr], #352]\n"
+ "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
+ "ldr q4, [%[a_ptr], #368]\n" ASM_PREFETCH("[%[a_ptr], #1984]")
+
+ // Unroll 3
+ "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
+ "ldr q5, [%[a_ptr], #384]\n"
+ "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
+ "ldr q6, [%[a_ptr], #400]\n"
+ "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
+ "ldr q7, [%[a_ptr], #416]\n"
+ "fmla %[r3].4s, v8.4s, %[x0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2048]")
+ "ldr q8, [%[a_ptr], #432]\n"
+ "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
+ "ldr q9, [%[a_ptr], #448]\n"
+ "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
+ "ldr q10, [%[a_ptr], #464]\n"
+ "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
+ "ldr q11, [%[a_ptr], #480]\n"
+ "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
+ "ldr q12, [%[a_ptr], #496]\n" ASM_PREFETCH("[%[a_ptr], #2112]")
+
+ // Unroll 4
+ "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
+ "ldr %q[x0], [%[x_ptr]]\n"
+ "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
+ "ldr q14, [%[a_ptr], #512]\n"
+ "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
+ "ldr q15, [%[a_ptr], #528]\n"
+ "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n" ASM_PREFETCH("[%[a_ptr], #2176]")
+ "ldr q16, [%[a_ptr], #544]\n"
+ "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
+ "ldr q17, [%[a_ptr], #560]\n"
+ "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
+ "ldr q18, [%[a_ptr], #576]\n"
+ "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
+ "ldr q19, [%[a_ptr], #592]\n"
+ "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
+ "ldr q20, [%[a_ptr], #608]\n" ASM_PREFETCH("[%[a_ptr], #2240]")
+
+ // Unroll 5
+ "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
+ "ldr q21, [%[a_ptr], #624]\n"
+ "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
+ "ldr q22, [%[a_ptr], #640]\n"
+ "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
+ "ldr q23, [%[a_ptr], #656]\n"
+ "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
+ "ldr q2, [%[a_ptr], #672]\n" ASM_PREFETCH("[%[a_ptr], #2304]")
+ "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
+ "ldr q3, [%[a_ptr], #688]\n"
+ "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
+ "ldr q4, [%[a_ptr], #704]\n"
+ "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
+ "ldr q5, [%[a_ptr], #720]\n"
+ "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
+ "ldr q6, [%[a_ptr], #736]\n" ASM_PREFETCH("[%[a_ptr], #2368]")
+
+ // Unroll 6
+ "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
+ "ldr q7, [%[a_ptr], #752]\n"
+ "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
+ "ldr q8, [%[a_ptr], #768]\n"
+ "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
+ "ldr q9, [%[a_ptr], #784]\n"
+ "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
+ "ldr q10, [%[a_ptr], #800]\n" ASM_PREFETCH("[%[a_ptr], #2432]")
+ "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
+ "ldr q11, [%[a_ptr], #816]\n"
+ "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
+ "ldr q12, [%[a_ptr], #832]\n"
+ "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
+ "ldr q13, [%[a_ptr], #848]\n"
+ "ldr q14, [%[a_ptr], #864]\n"
+ "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
+ "ldr q15, [%[a_ptr], #880]\n" ASM_PREFETCH("[%[a_ptr], #2496]")
+
+ // Unroll 7
+ "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
+ "ldr q16, [%[a_ptr], #896]\n"
+ "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
+ "ldr q17, [%[a_ptr], #912]\n"
+ "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
+ "ldr q18, [%[a_ptr], #928]\n"
+ "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2560]")
+ "ldr q19, [%[a_ptr], #944]\n"
+ "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
+ "ldr q20, [%[a_ptr], #960]\n"
+ "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
+ "ldr q21, [%[a_ptr], #976]\n"
+ "add %[a_ptr], %[a_ptr], #1024\n"
+ "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
+ "ldr q22, [%[a_ptr], #-32]\n"
+ "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
+ "ldr q23, [%[a_ptr], #-16]\n" ASM_PREFETCH("[%[a_ptr], #1600]")
+ "bne 1b\n"
+
+ // Detached final iteration
+ "2:\n"
+
+ // Unroll 0
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr %q[x0a], [%[x_ptr], #16]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #0]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #16]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #32]\n"
+ "add %[x_ptr], %[x_ptr], #32\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #48]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #64]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #80]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #96]\n"
+
+ // Unroll 1
+ "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
+ "ldr q10, [%[a_ptr], #112]\n"
+ "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
+ "ldr q11, [%[a_ptr], #128]\n"
+ "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
+ "ldr q12, [%[a_ptr], #144]\n"
+ "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
+ "ldr q13, [%[a_ptr], #160]\n"
+ "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
+ "ldr q14, [%[a_ptr], #176]\n"
+ "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
+ "ldr q15, [%[a_ptr], #192]\n"
+ "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
+ "ldr q16, [%[a_ptr], #208]\n"
+ "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
+ "ldr q17, [%[a_ptr], #224]\n"
+
+ // Unroll 2
+ "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
+ "ldr q18, [%[a_ptr], #240]\n"
+ "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
+ "ldr q19, [%[a_ptr], #256]\n"
+ "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
+ "ldr q20, [%[a_ptr], #272]\n"
+ "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
+ "ldr q21, [%[a_ptr], #288]\n"
+ "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
+ "ldr q22, [%[a_ptr], #304]\n"
+ "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
+ "ldr q23, [%[a_ptr], #320]\n"
+ "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
+ "ldr q2, [%[a_ptr], #336]\n"
+ "ldr q3, [%[a_ptr], #352]\n"
+ "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
+ "ldr q4, [%[a_ptr], #368]\n"
+
+ // Unroll 3
+ "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
+ "ldr q5, [%[a_ptr], #384]\n"
+ "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
+ "ldr q6, [%[a_ptr], #400]\n"
+ "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
+ "ldr q7, [%[a_ptr], #416]\n"
+ "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
+ "ldr q8, [%[a_ptr], #432]\n"
+ "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
+ "ldr q9, [%[a_ptr], #448]\n"
+ "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
+ "ldr q10, [%[a_ptr], #464]\n"
+ "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
+ "ldr q11, [%[a_ptr], #480]\n"
+ "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
+ "ldr q12, [%[a_ptr], #496]\n"
+
+ // Unroll 4
+ "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
+ "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
+ "ldr q14, [%[a_ptr], #512]\n"
+ "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
+ "ldr q15, [%[a_ptr], #528]\n"
+ "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
+ "ldr q16, [%[a_ptr], #544]\n"
+ "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
+ "ldr q17, [%[a_ptr], #560]\n"
+ "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
+ "ldr q18, [%[a_ptr], #576]\n"
+ "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
+ "ldr q19, [%[a_ptr], #592]\n"
+ "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
+ "ldr q20, [%[a_ptr], #608]\n"
+
+ // Unroll 5
+ "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
+ "ldr q21, [%[a_ptr], #624]\n"
+ "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
+ "ldr q22, [%[a_ptr], #640]\n"
+ "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
+ "ldr q23, [%[a_ptr], #656]\n"
+ "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #672\n"
+ "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
+ "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
+ "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
+ "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
+
+ // Unroll 6
+ "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
+ "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
+ "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
+ "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
+ "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
+ "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
+ "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
+ "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
+
+ // Unroll 7
+ "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
+ "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
+ "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
+ "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
+ "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
+ "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
+ "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
+ "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
+ :
+ [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+ [x0] "+w"(x0), [x0a] "+w"(x0a), [k] "+r"(k),
+ [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+ [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+ :
+ : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+ "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory");
+ }
+
+ // Deal with ragged M
+ if(M % 8)
+ {
+ int l = (M % 8) - 1;
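+ // One element of X per iteration this time, again with the final
+ // iteration peeled off the loop.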
+
+ __asm __volatile(
+ "ldr q2, [%[a_ptr], #0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr %s[x0], [%[x_ptr]]\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+
+ "cbz %w[l], 2f\n"
+
+ "1:\n"
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "subs %w[l], %w[l], #1\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr %s[x0], [%[x_ptr]]\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+ "bne 1b\n"
+
+ "2:\n"
+
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ :
+ [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+ [x0] "+w"(x0), [l] "+r"(l),
+ [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+ [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+ :
+ : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory");
+ }
+
+ if(l == 32)
+ {
+ // Fast path
+ vst1q_f32(y_ptr, r0);
+ vst1q_f32(y_ptr + 4, r1);
+ vst1q_f32(y_ptr + 8, r2);
+ vst1q_f32(y_ptr + 12, r3);
+ vst1q_f32(y_ptr + 16, r4);
+ vst1q_f32(y_ptr + 20, r5);
+ vst1q_f32(y_ptr + 24, r6);
+ vst1q_f32(y_ptr + 28, r7);
+ }
+ else
+ {
+ int vecs = l / 4;
+ int oddbits = l % 4;
+
+ if(oddbits)
+ {
+ // As above - slowest path deals with vectors plus odd bits
+ float32x4_t oddvec;
+
+ do
+ {
+ if(vecs == 0)
+ {
+ oddvec = r0;
+ break;
+ }
+
+ vst1q_f32(y_ptr, r0);
+ if(--vecs == 0)
+ {
+ oddvec = r1;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 4, r1);
+ if(--vecs == 0)
+ {
+ oddvec = r2;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 8, r2);
+ if(--vecs == 0)
+ {
+ oddvec = r3;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 12, r3);
+ if(--vecs == 0)
+ {
+ oddvec = r4;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 16, r4);
+ if(--vecs == 0)
+ {
+ oddvec = r5;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 20, r5);
+ if(--vecs == 0)
+ {
+ oddvec = r6;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 24, r6);
+ oddvec = r7;
+ }
+ while(0);
+
+ float *oddbase = y_ptr + l - oddbits;
+
+ switch(oddbits)
+ {
+ case 3:
+ vst1q_lane_f32(oddbase + 2, oddvec, 2);
+ // fall through
+ case 2:
+ vst1q_lane_f32(oddbase + 1, oddvec, 1);
+ // fall through
+ case 1:
+ vst1q_lane_f32(oddbase, oddvec, 0);
+ break;
+
+ default:
+ // oddbits must be 1, 2 or 3.
+ UNREACHABLE("Impossible case in switch.");
+ }
+ }
+ else
+ {
+ // As above - medium path deals with vectors only
+ do
+ {
+ if(vecs == 0)
+ {
+ UNREACHABLE("vecs and oddbits can't both be 0");
+ }
+
+ vst1q_f32(y_ptr, r0);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 4, r1);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 8, r2);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 12, r3);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 16, r4);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 20, r5);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 24, r6);
+ }
+ while(0);
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/graph/CL/CLMap.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
similarity index 61%
copy from src/graph/CL/CLMap.cpp
copy to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 5289ea9..5b9bd72 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,35 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLMap.h"
+#pragma once
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#ifdef __aarch64__
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_gemm
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
+// Actual kernel implementations
+void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
-void CLMap::run()
+// Transposed SGEMV strategy class.
+class sgemv_trans
{
- _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
-}
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static const int out_width = 96;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a64_sgemv_trans;
+
+ sgemv_trans(const CPUInfo *ci)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
new file mode 100644
index 0000000..8fa403b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
@@ -0,0 +1,914 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - a
+// reasonable limit is 24 of the 32 NEON registers = 96 total accumulators.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop. In this kernel we
+// process all of M at the same time.
+
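+// As a reference (inferred from the code below): for each output n in
+// [0, N) the kernel computes
+//   Y[n] = beta * Y[n] + sum_{m=0..M-1} A[m * lda + n] * X[m]
+// i.e. a dot product down each column of the row-major M x N matrix A.
+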
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
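+
+// Both values are in rows of A: the "first" prefetch touches the leading
+// cache line of a row FIRST_PFD iterations before it is used, while the
+// main prefetches fill the remaining five 64-byte lines of each 96-float
+// (384-byte) row PFD iterations ahead.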
+
+namespace arm_gemm
+{
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float beta, int lda, int M, int N)
+{
+ const float *a_ptr_base = Astart;
+ float *y_ptr = Ystart;
+
+ register const float32x4_t vb asm("v1") = vdupq_n_f32(beta);
+
+ int firstpfd = FIRST_PFD;
+ if(firstpfd > M)
+ {
+ firstpfd = (M - 1);
+ }
+
+ int pfd = PFD;
+ if(pfd > M)
+ {
+ pfd = (M - 1);
+ }
+
+ ptrdiff_t jump = lda * sizeof(float); // row-to-row stride of A, in bytes
+
+ for(; N >= 96; N -= 96)
+ {
+ int k = M - 1;
+
+ const float *a_ptr = a_ptr_base;
+ const float *x_ptr = Xstart;
+ const float *pf_ptr = a_ptr;
+ const float *firstpf_ptr = a_ptr;
+ const float *pf_limit = a_ptr + (M * lda);
+
+ for(int i = 0; i < firstpfd; i++)
+ {
+ prefetch_1x(firstpf_ptr);
+ firstpf_ptr += lda;
+ }
+
+ for(int i = 0; i < pfd; i++)
+ {
+ prefetch_5x(pf_ptr + 16);
+ pf_ptr += lda;
+ }
+
+ a_ptr_base += 96;
+
+ __asm __volatile(
+ "movi v8.4s,#0x0\n"
+ "ldr w0, [%[x_ptr]]\n"
+ "movi v9.4s,#0x0\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "movi v10.4s,#0x0\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "movi v11.4s,#0x0\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "movi v12.4s,#0x0\n"
+ "ldr q5, [%[a_ptr], #0x30]\n"
+ "movi v13.4s,#0x0\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "movi v14.4s,#0x0\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "movi v15.4s,#0x0\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #64]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #128]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #192]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #320]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "movi v28.4s, #0x0\n"
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip everything if there are no iterations of the main loop to do.
+ "cbz %w[k], 10f\n"
+
+ // Loop with all prefetches. Exit this loop when firstpf_ptr
+ // hits pf_limit.
+ "1:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #0x4\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "sub %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x00]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "cmp %[firstpf_ptr], %[pf_limit]\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "blt 1b\n"
+
+ // Check that there are still "main" prefetches to do.
+ "cmp %[pf_ptr], %[pf_limit]\n"
+ "bge 9f\n"
+
+ // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
+ "8:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #0x4\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "sub %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x00]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "cmp %[pf_ptr], %[pf_limit]\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "blt 8b\n"
+
+ // Check that there is still work to do.
+ "9:\n"
+ "cmp %w[k], #0\n"
+ "beq 10f\n"
+
+ // Loop without prefetches, exit when k hits 0.
+ "2:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #0x4\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n"
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n"
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n"
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n"
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x00]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x30]\n"
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "bne 2b\n"
+
+ "10:\n"
+
+ // Final iteration
+ "dup v0.4s, w0\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n"
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n"
+ "fmla v18.4s, v6.4s, v0.4s\n"
+
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n"
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[y_ptr]]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[y_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[y_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[y_ptr], #0x30]\n"
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[y_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[y_ptr], #0x50]\n"
+
+ "fmla v8.4s, v2.4s, %[vb].4s\n"
+ "ldr q2, [%[y_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, %[vb].4s\n"
+ "ldr q3, [%[y_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, %[vb].4s\n"
+ "ldr q4, [%[y_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, %[vb].4s\n"
+ "ldr q5, [%[y_ptr], #0x90]\n"
+ "fmla v12.4s, v6.4s, %[vb].4s\n"
+ "ldr q6, [%[y_ptr], #0xa0]\n"
+ "str q8, [%[y_ptr], #0x00]\n"
+ "fmla v13.4s, v7.4s, %[vb].4s\n"
+ "ldr q7, [%[y_ptr], #0xb0]\n"
+ "str q9, [%[y_ptr], #0x10]\n"
+ "fmla v14.4s, v2.4s, %[vb].4s\n"
+ "ldr q2, [%[y_ptr], #0xc0]\n"
+ "str q10, [%[y_ptr], #0x20]\n"
+ "fmla v15.4s, v3.4s, %[vb].4s\n"
+ "ldr q3, [%[y_ptr], #0xd0]\n"
+ "str q11, [%[y_ptr], #0x30]\n"
+ "fmla v16.4s, v4.4s, %[vb].4s\n"
+ "ldr q4, [%[y_ptr], #0xe0]\n"
+ "str q12, [%[y_ptr], #0x40]\n"
+ "fmla v17.4s, v5.4s, %[vb].4s\n"
+ "ldr q5, [%[y_ptr], #0xf0]\n"
+ "str q13, [%[y_ptr], #0x50]\n"
+ "fmla v18.4s, v6.4s, %[vb].4s\n"
+ "ldr q6, [%[y_ptr], #0x100]\n"
+ "str q14, [%[y_ptr], #0x60]\n"
+ "fmla v19.4s, v7.4s, %[vb].4s\n"
+ "ldr q7, [%[y_ptr], #0x110]\n"
+ "str q15, [%[y_ptr], #0x70]\n"
+ "fmla v20.4s, v2.4s, %[vb].4s\n"
+ "ldr q2, [%[y_ptr], #0x120]\n"
+ "str q16, [%[y_ptr], #0x80]\n"
+ "fmla v21.4s, v3.4s, %[vb].4s\n"
+ "ldr q3, [%[y_ptr], #0x130]\n"
+ "str q17, [%[y_ptr], #0x90]\n"
+ "fmla v22.4s, v4.4s, %[vb].4s\n"
+ "ldr q4, [%[y_ptr], #0x140]\n"
+ "str q18, [%[y_ptr], #0xa0]\n"
+ "fmla v23.4s, v5.4s, %[vb].4s\n"
+ "ldr q5, [%[y_ptr], #0x150]\n"
+ "str q19, [%[y_ptr], #0xb0]\n"
+ "fmla v24.4s, v6.4s, %[vb].4s\n"
+ "ldr q6, [%[y_ptr], #0x160]\n"
+ "str q20, [%[y_ptr], #0xc0]\n"
+ "fmla v25.4s, v7.4s, %[vb].4s\n"
+ "ldr q7, [%[y_ptr], #0x170]\n"
+ "str q21, [%[y_ptr], #0xd0]\n"
+ "fmla v26.4s, v2.4s, %[vb].4s\n"
+ "str q22, [%[y_ptr], #0xe0]\n"
+ "fmla v27.4s, v3.4s, %[vb].4s\n"
+ "str q23, [%[y_ptr], #0xf0]\n"
+ "fmla v28.4s, v4.4s, %[vb].4s\n"
+ "str q24, [%[y_ptr], #0x100]\n"
+ "fmla v29.4s, v5.4s, %[vb].4s\n"
+ "str q25, [%[y_ptr], #0x110]\n"
+ "fmla v30.4s, v6.4s, %[vb].4s\n"
+ "str q26, [%[y_ptr], #0x120]\n"
+ "fmla v31.4s, v7.4s, %[vb].4s\n"
+ "str q27, [%[y_ptr], #0x130]\n"
+
+ "stp q28, q29, [%[y_ptr], #0x140]\n"
+ "stp q30, q31, [%[y_ptr], #0x160]\n"
+ "add %[y_ptr], %[y_ptr], #0x180\n"
+
+ : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr)
+ : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit)
+ : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+
+ if(N > 0)
+ {
+ // Handle N tail - up to 95 stragglers.
+ // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
+ // single value for the remainder.
+
+ // Independent pointers into the matrix for the odd 2 and odd 1.
+ // These double up as flags to indicate whether they are needed.
+ const float *odd2_aptr = NULL;
+ const float *odd1_aptr = NULL;
+
+ // Figure out how much work we need to do.
+ int numvecs = N / 4;
+ int rem = N % 4;
+ int k = M;
+
+ // Set up pointers for the odd 2/1 if needed.
+ if(rem >= 2)
+ {
+ odd2_aptr = a_ptr_base + (numvecs * 4);
+ }
+
+ if(rem & 1)
+ {
+ odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr == NULL ? 0 : 2);
+ }
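+
+ // Worked example: N = 95 gives numvecs = 23 and rem = 3, so odd2_aptr
+ // points at elements 92-93 of the block and odd1_aptr at element 94.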
+
+ const float *a_ptr = a_ptr_base;
+ const float *firstpf_ptr = a_ptr_base;
+ const float *pf_ptr = a_ptr_base;
+ const float *pf_limit = a_ptr + (M * lda);
+
+ const float *x_ptr = Xstart;
+ int vecs = 0; // Working variable to count how many vectors to work on.
+ int dopf = 1; // Track whether we are doing prefetches.
+
+ // Figure out how many cache lines we need to prefetch each time.
+ int numpfs = (N + 15) / 16;
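+ // (a 64-byte cache line holds 16 floats, so this is ceil(N / 16))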
+
+ // Do initial prefetches
+ for(int i = 0; i < firstpfd + 1; i++)
+ {
+ prefetch_1x(firstpf_ptr);
+ firstpf_ptr += lda;
+ }
+
+ // Do "main" prefetches - adapt number to the number we actually need.
+ if(numpfs > 1)
+ {
+ for(int i = 0; i < pfd + 1; i++)
+ {
+ switch(numpfs)
+ {
+ case 2:
+ prefetch_1x(pf_ptr + 16);
+ break;
+
+ case 3:
+ prefetch_2x(pf_ptr + 16);
+ break;
+
+ case 4:
+ prefetch_3x(pf_ptr + 16);
+ break;
+
+ case 5:
+ prefetch_4x(pf_ptr + 16);
+ break;
+
+ case 6:
+ prefetch_5x(pf_ptr + 16);
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ pf_ptr += lda;
+ }
+ }
+ else
+ {
+ // Just disable additional prefetches
+ dopf = 0;
+ }
+
+ // Do the real work
+ __asm __volatile(
+ // Initialize all the vectors - not worth skipping this if only
+ // some are needed.
+ "movi v8.4s,#0x0\n"
+ "ldr w0, [%[x_ptr]]\n"
+ "movi v9.4s,#0x0\n"
+ "movi v10.4s,#0x0\n"
+ "movi v11.4s,#0x0\n"
+ "movi v12.4s,#0x0\n"
+ "movi v13.4s,#0x0\n"
+ "movi v14.4s,#0x0\n"
+ "movi v15.4s,#0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v6.2s, #0x0\n"
+ "movi v5.2s, #0x0\n"
+
+ "1:\n" ASM_PREFETCH("[%[firstpf_ptr]]\n")
+ "11:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+
+ "cbz %w[numvecs], 2f\n"
+ "mov %w[vecs], %w[numvecs]\n"
+
+ // Vector 0
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x00]\n"
+ "fmla v8.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 1
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x10]\n"
+ "fmla v9.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 2
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x20]\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 3
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x30]\n"
+ "fmla v11.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 3f\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+ "3:\n"
+ "beq 2f\n"
+
+ // Vector 4
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x40]\n"
+ "fmla v12.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 5
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x50]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 6
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x60]\n"
+ "fmla v14.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 7
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x70]\n"
+ "fmla v15.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 4f\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+ "4:\n"
+ "beq 2f\n"
+
+ // Vector 8
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x80]\n"
+ "fmla v16.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 9
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x90]\n"
+ "fmla v17.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 10
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xa0]\n"
+ "fmla v18.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 11
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xb0]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 5f\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+ "5:\n"
+ "beq 2f\n"
+
+ // Vector 12
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xc0]\n"
+ "fmla v20.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 13
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xd0]\n"
+ "fmla v21.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 14
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xe0]\n"
+ "fmla v22.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 15
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xf0]\n"
+ "fmla v23.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 6f\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+ "6:\n"
+ "beq 2f\n"
+
+ // Vector 16
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x100]\n"
+ "fmla v24.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 17
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x110]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 18
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x120]\n"
+ "fmla v26.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 19
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x130]\n"
+ "fmla v27.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 7f\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+ "7:\n"
+ "beq 2f\n"
+
+ // Vector 20
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x140]\n"
+ "fmla v28.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 21
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x150]\n"
+ "fmla v29.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 22
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x160]\n"
+ "fmla v30.4s, v7.4s, v0.4s\n"
+
+ "2:\n"
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+
+ // Do the odd 2-vector, if needed
+ "cbz %[odd2_aptr], 8f\n"
+ "ldr d7, [%[odd2_aptr]]\n"
+ "fmla v6.2s, v7.2s, v0.2s\n"
+ "add %[odd2_aptr], %[odd2_aptr], %[jump]\n"
+
+ "8:\n"
+ // Do the odd 1-vector, if needed
+ "cbz %[odd1_aptr], 9f\n"
+ "ldr s7, [%[odd1_aptr]]\n"
+ "fmla v5.2s, v7.2s, v0.2s\n"
+ "add %[odd1_aptr], %[odd1_aptr], %[jump]\n"
+
+ // Get out if needed.
+ "9:\n"
+ "subs %w[k], %w[k], #1\n"
+ "beq 10f\n"
+
+ // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "cmp %[pf_ptr], %[pf_limit]\n"
+ "csel %w[dopf], %w[dopf], WZR, LT\n"
+
+ // Update the "leading" prefetch pointer, don't do the first
+ // instruction of the loop if it's over the limit.
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+ "cmp %[firstpf_ptr], %[pf_limit]\n"
+ "blt 1b\n"
+ "b 11b\n"
+
+ // Now write out the outputs
+ "10:\n"
+ "cbz %w[numvecs], 12f\n"
+ "mov %w[vecs], %w[numvecs]\n"
+
+ // Vector 0
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v8.4s, v7.4s, %[vb].4s\n"
+ "str q8, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 1
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v9.4s, v7.4s, %[vb].4s\n"
+ "str q9, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 2
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v10.4s, v7.4s, %[vb].4s\n"
+ "str q10, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 3
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v11.4s, v7.4s, %[vb].4s\n"
+ "str q11, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 4
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v12.4s, v7.4s, %[vb].4s\n"
+ "str q12, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 5
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v13.4s, v7.4s, %[vb].4s\n"
+ "str q13, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 6
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v14.4s, v7.4s, %[vb].4s\n"
+ "str q14, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 7
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v15.4s, v7.4s, %[vb].4s\n"
+ "str q15, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 8
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v16.4s, v7.4s, %[vb].4s\n"
+ "str q16, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 9
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v17.4s, v7.4s, %[vb].4s\n"
+ "str q17, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 10
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v18.4s, v7.4s, %[vb].4s\n"
+ "str q18, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 11
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v19.4s, v7.4s, %[vb].4s\n"
+ "str q19, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 12
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v20.4s, v7.4s, %[vb].4s\n"
+ "str q20, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 13
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v21.4s, v7.4s, %[vb].4s\n"
+ "str q21, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 14
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v22.4s, v7.4s, %[vb].4s\n"
+ "str q22, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 15
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v23.4s, v7.4s, %[vb].4s\n"
+ "str q23, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 16
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v24.4s, v7.4s, %[vb].4s\n"
+ "str q24, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 17
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v25.4s, v7.4s, %[vb].4s\n"
+ "str q25, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 18
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v26.4s, v7.4s, %[vb].4s\n"
+ "str q26, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 19
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v27.4s, v7.4s, %[vb].4s\n"
+ "str q27, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 20
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v28.4s, v7.4s, %[vb].4s\n"
+ "str q28, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 21
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v29.4s, v7.4s, %[vb].4s\n"
+ "str q29, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 22
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v30.4s, v7.4s, %[vb].4s\n"
+ "str q30, [%[y_ptr]], #0x10\n"
+
+ // Odd 2
+ "12:\n"
+ "cbz %[odd2_aptr], 13f\n"
+ "ldr d7, [%[y_ptr]]\n"
+ "fmla v6.2s, v7.2s, %[vb].2s\n"
+ "str d6, [%[y_ptr]], #0x8\n"
+
+ // Odd 1
+ "13:\n"
+ "cbz %[odd1_aptr], 14f\n"
+ "ldr s7, [%[y_ptr]]\n"
+ "fmla v5.2s, v7.2s, %[vb].2s\n"
+ "str s5, [%[y_ptr]]\n"
+
+ "14:\n"
+ : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
+ [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
+ [odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
+ [dopf] "+r"(dopf), [vecs] "+r"(vecs)
+ : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
+ : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
new file mode 100644
index 0000000..4a6da3d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* The individual merge headers are included inside the arm_gemm namespace
+ * (see the bottom of this file), so any headers they need must be pulled in
+ * here, at file scope. */
+#include <arm_neon.h>
+
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm
+{
+template <unsigned int width, unsigned int height, typename Tin, typename Tout>
+inline void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta)
+{
+ int full_y_blocks = (ymax - y0) / height;
+ int y_remainder = (ymax - y0) % height;
+ int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
+
+ int full_x_blocks = (xmax - x0) / width;
+ int x_remainder = (xmax - x0) % width;
+ int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
+
+ for(int y_block = 0; y_block < y_blocks; y_block++)
+ {
+ int ybase = y0 + (y_block * height);
+
+ int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+ for(int x_block = 0; x_block < x_blocks; x_block++)
+ {
+ int xbase = x0 + (x_block * width);
+
+ int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
+
+ for(int row = 0; row < fill_rows; row++)
+ {
+ for(int col = 0; col < fill_cols; col++)
+ {
+ Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+ p = (p * beta) + (alpha * in[row * width + col]);
+ }
+ }
+
+ in += (width * height);
+ }
+ }
+}
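+
+/* Usage sketch (illustrative only; sizes and names are hypothetical): merge
+ * one 4x4 block of interleaved partial results into a 6-column output C,
+ * computing C = alpha * (A*B) + beta * C over that block:
+ *
+ *   float C[6 * 6] = {};        // destination, leading dimension ldc = 6
+ *   float partial[4 * 4] = {};  // one interleaved block from a GEMM kernel
+ *   MergeResults<4, 4>(C, partial, 6, 0, 4, 0, 4, 2.0f, 1.0f);
+ *
+ * Each destination element is updated as p = (p * beta) + (alpha * in[...]),
+ * exactly as in the inner loop above. */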
+
+#include "merges/list.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
new file mode 100644
index 0000000..b44e564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 6)
+ {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+
+ for(int i = x0; i < xmax; i += 8)
+ {
+ float dummyres[8];
+
+            /* Make sure we throw away results if Y isn't a multiple of 6.
+             * We do this by pointing the result pointers at a dummy buffer
+             * we later discard (note the deliberate case fall-through below). */
+ if((y + 5) >= ymax)
+ {
+ switch((y + 5) - ymax)
+ {
+ case 4:
+ outptr1 = dummyres;
+ case 3:
+ outptr2 = dummyres;
+ case 2:
+ outptr3 = dummyres;
+ case 1:
+ outptr4 = dummyres;
+ case 0:
+ outptr5 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
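+
+            /* Illustrative example: with ymax = 10 and y = 6, (y + 5) - ymax
+             * is 1, so outptr4 and outptr5 are redirected to dummyres and only
+             * the four valid rows are written back to memory. */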
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 7) >= xmax)
+ {
+ for(int xi = 0; xi < 8; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
+ outptr5++;
+ }
+ }
+ inptr += 48;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Rows 0-1
+ "VLD1.32 {d8-d11}, [%[outptr0]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr1]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[inptr], #352]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr0]]!\n" ASM_PREFETCH("[%[inptr], #416]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[inptr], #480]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr1]]!\n"
+
+ // Rows 2-3
+ "VLD1.32 {d8-d11}, [%[outptr2]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr3]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr0], #96]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr2]]!\n" ASM_PREFETCH("[%[outptr1], #96]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr2], #96]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr3]]!\n"
+
+ // Rows 4-5
+ "VLD1.32 {d8-d11}, [%[outptr4]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr5]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr3], #96]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr4]]!\n" ASM_PREFETCH("[%[outptr4], #96]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr5], #128]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr5]]!\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [inptr] "+r"(inptr)
+ : [av] "w"(av), [bv] "w"(bv)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+ }
+ }
+ }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000..3b59a43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 12)
+ {
+ float dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 11) >= xmax)
+ {
+ for(int xi = 0; xi < 12; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Rows 0-1
+ "LDP q16, q17, [%[outptr0]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr0], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr1]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr1], #32]\n" ASM_PREFETCH("[%[inptr], #768]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #832]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr0]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr0]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr1]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr1]], #16\n"
+
+ // Rows 2-3
+ "LDP q16, q17, [%[outptr2]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr2], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr3]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr3], #32]\n" ASM_PREFETCH("[%[inptr], #960]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1024]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr2]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr2]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1088]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr3]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr3]], #16\n"
+
+ // Rows 4-5
+ ASM_PREFETCH("[%[outptr0], #80]")
+ "LDP q16, q17, [%[outptr4]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr4], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr5]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr5], #32]\n" ASM_PREFETCH("[%[outptr1], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr2], #80]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr4]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr4]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr3], #80]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr5]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr5]], #16\n"
+
+ // Rows 6-7
+ ASM_PREFETCH("[%[outptr4], #80]")
+ "LDP q16, q17, [%[outptr6]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr6], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr7]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr7], #32]\n" ASM_PREFETCH("[%[outptr5], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr6], #128]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr6]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr6]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr7], #128]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr7]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr7]], #16\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [av] "w"(av), [bv] "w"(bv)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+ }
+ }
+ }
+}
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
new file mode 100644
index 0000000..9708fe1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+// This should be possible on any AArch64 target, but some old compilers don't support __fp16 arguments.
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<12, 8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta)
+{
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 24);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ __fp16 *outptr0 = out + (y * ldout) + x0;
+ __fp16 *outptr1 = outptr0 + ldout;
+ __fp16 *outptr2 = outptr1 + ldout;
+ __fp16 *outptr3 = outptr2 + ldout;
+ __fp16 *outptr4 = outptr3 + ldout;
+ __fp16 *outptr5 = outptr4 + ldout;
+ __fp16 *outptr6 = outptr5 + ldout;
+ __fp16 *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 12)
+ {
+ __fp16 dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 11) >= xmax)
+ {
+ for(int xi = 0; xi < 12; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
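+                /* Note: the output is __fp16 but the arithmetic is done in
+                 * float - each half-precision tile is widened with FCVTL/FCVTL2,
+                 * scaled and accumulated with FMUL/FMLA in 32 bits, then
+                 * narrowed back with FCVTN/FCVTN2 before being stored. */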
+ __asm __volatile(
+ // Rows 0-1
+ "LDR q16, [%[outptr0]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr0], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr1]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr1], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #768]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #832]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #960]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr0]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr0]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr1]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr1]], #8\n"
+
+ // Rows 2-3
+ "LDR q16, [%[outptr2]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr2], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr3]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr3], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #1024]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #1088]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr0], #64]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr1], #64]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr2]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr2]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr3]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr3]], #8\n"
+
+ // Rows 4-5
+ "LDR q16, [%[outptr4]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr4], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr5]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr5], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr2], #64]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr3], #64]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr4], #88]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr4]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr4]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr5]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr5]], #8\n"
+
+ // Rows 6-7
+ "LDR q16, [%[outptr6]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr6], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr7]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr7], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr5], #64]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr6], #88]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr7], #88]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr6]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr6]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr7]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr7]], #8\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [av] "w"(av), [bv] "w"(bv)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+ }
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
new file mode 100644
index 0000000..08cfc00
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+template <>
+inline void MergeResults<24, 8>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax,
+ const int x0, const int xmax, const __fp16 alpha, const __fp16 beta)
+{
+ const __fp16 *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 48);
+
+ float16x8_t va = vdupq_n_f16(alpha);
+ float16x8_t vb = vdupq_n_f16(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ __fp16 *outptr0 = out + (y * ldout) + x0;
+ __fp16 *outptr1 = outptr0 + ldout;
+ __fp16 *outptr2 = outptr1 + ldout;
+ __fp16 *outptr3 = outptr2 + ldout;
+ __fp16 *outptr4 = outptr3 + ldout;
+ __fp16 *outptr5 = outptr4 + ldout;
+ __fp16 *outptr6 = outptr5 + ldout;
+ __fp16 *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 24)
+ {
+ __fp16 dummyres[24];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 23) >= xmax)
+ {
+ for(int xi = 0; xi < 24; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 192;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
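+                /* Note: unlike the float-to-half merge, this kernel computes
+                 * directly in half precision (FMUL/FMLA on .8h), which is why
+                 * the ".arch armv8.2-a+fp16" directive below is needed. */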
+ __asm __volatile(
+ ".arch armv8.2-a+fp16\n"
+ // Rows 0-1
+ "LDP q16, q17, [%[outptr0]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr0], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr1]]\n"
+ "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #768]")
+ "LDR q21, [%[outptr1], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #832]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr0]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr0]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #896]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr1]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr1]], #16\n" ASM_PREFETCH("[%[inptr], #960]")
+
+ // Rows 2-3
+ "LDP q16, q17, [%[outptr2]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr2], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr3]]\n"
+ "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #1024]")
+ "LDR q21, [%[outptr3], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #1088]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr2]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr2]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr0], #80]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr3]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr3]], #16\n" ASM_PREFETCH("[%[outptr1], #80]")
+
+ // Rows 4-5
+ "LDP q16, q17, [%[outptr4]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr4], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr5]]\n"
+ "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[outptr2], #80]")
+ "LDR q21, [%[outptr5], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr3], #80]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr4]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr4]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr4], #80]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr5]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr5]], #16\n"
+
+ // Rows 6-7
+ "LDP q16, q17, [%[outptr6]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr6], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr7]]\n" ASM_PREFETCH("[%[outptr5], #80]")
+ "FMUL v18.8h, v18.8h, %[vb].8h\n"
+ "LDR q21, [%[outptr7], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr6], #128]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr6]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr6]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr7], #128]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr7]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr7]], #16\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [va] "w"(va), [vb] "w"(vb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+ }
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
new file mode 100644
index 0000000..79dd1f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta)
+{
+ const int32_t *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ int32x4_t alpha_value = vdupq_n_s32(alpha);
+ int32x4_t beta_value = vdupq_n_s32(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ int32_t *outptr0 = out + (y * ldout) + x0;
+ int32_t *outptr1 = outptr0 + ldout;
+ int32_t *outptr2 = outptr1 + ldout;
+ int32_t *outptr3 = outptr2 + ldout;
+ int32_t *outptr4 = outptr3 + ldout;
+ int32_t *outptr5 = outptr4 + ldout;
+ int32_t *outptr6 = outptr5 + ldout;
+ int32_t *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 12)
+ {
+ int32_t dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 11) >= xmax)
+ {
+ for(int xi = 0; xi < 12; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Row 0
+ ASM_PREFETCH("[%x[outptr1], #192]")
+ "ldr q3, [%x[outptr0]]\n"
+ "ldr q4, [%x[outptr0], #0x10]\n"
+ "ldr q5, [%x[outptr0], #0x20]\n"
+ "mul v3.4s, v3.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr]]\n"
+ "mul v4.4s, v4.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0x10]\n"
+ "mul v5.4s, v5.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0x20]\n"
+ "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q0, [%x[outptr1]]\n"
+ "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q1, [%x[outptr1], #0x10]\n"
+ "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q2, [%x[outptr1], #0x20]\n"
+
+ // Row 1
+ ASM_PREFETCH("[%x[outptr2], #192]")
+ "mul v0.4s, v0.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0x30]\n"
+ "str q3, [%x[outptr0]], #0x10\n"
+ "mul v1.4s, v1.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0x40]\n"
+ "str q4, [%x[outptr0]], #0x10\n"
+ "mul v2.4s, v2.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0x50]\n"
+ "str q5, [%x[outptr0]], #0x10\n"
+ "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q3, [%x[outptr2]]\n"
+ "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q4, [%x[outptr2], #0x10]\n"
+ "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q5, [%x[outptr2], #0x20]\n"
+
+ // Row 2
+ ASM_PREFETCH("[%x[outptr3], #192]")
+ "mul v3.4s, v3.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0x60]\n"
+ "str q0, [%x[outptr1]], #0x10\n"
+ "mul v4.4s, v4.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0x70]\n"
+ "str q1, [%x[outptr1]], #0x10\n"
+ "mul v5.4s, v5.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0x80]\n"
+ "str q2, [%x[outptr1]], #0x10\n"
+ "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q0, [%x[outptr3]]\n"
+ "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q1, [%x[outptr3], #0x10]\n"
+ "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q2, [%x[outptr3], #0x20]\n"
+
+ // Row 3
+ ASM_PREFETCH("[%x[outptr4], #192]")
+ "mul v0.4s, v0.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0x90]\n"
+ "str q3, [%x[outptr2]], #0x10\n"
+ "mul v1.4s, v1.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0xa0]\n"
+ "str q4, [%x[outptr2]], #0x10\n"
+ "mul v2.4s, v2.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0xb0]\n"
+ "str q5, [%x[outptr2]], #0x10\n"
+ "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q3, [%x[outptr4]]\n"
+ "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q4, [%x[outptr4], #0x10]\n"
+ "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q5, [%x[outptr4], #0x20]\n"
+
+ // Row 4
+ ASM_PREFETCH("[%x[outptr5], #192]")
+ "mul v3.4s, v3.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0xc0]\n"
+ "str q0, [%x[outptr3]], #0x10\n"
+ "mul v4.4s, v4.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0xd0]\n"
+ "str q1, [%x[outptr3]], #0x10\n"
+ "mul v5.4s, v5.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0xe0]\n"
+ "str q2, [%x[outptr3]], #0x10\n"
+ "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q0, [%x[outptr5]]\n"
+ "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q1, [%x[outptr5], #0x10]\n"
+ "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q2, [%x[outptr5], #0x20]\n"
+
+ // Row 5
+ ASM_PREFETCH("[%x[outptr6], #192]")
+ "mul v0.4s, v0.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0xf0]\n"
+ "str q3, [%x[outptr4]], #0x10\n"
+ "mul v1.4s, v1.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0x100]\n"
+ "str q4, [%x[outptr4]], #0x10\n"
+ "mul v2.4s, v2.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0x110]\n"
+ "str q5, [%x[outptr4]], #0x10\n"
+ "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q3, [%x[outptr6]]\n"
+ "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q4, [%x[outptr6], #0x10]\n"
+ "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q5, [%x[outptr6], #0x20]\n"
+
+ // Row 6
+ ASM_PREFETCH("[%x[outptr7], #192]")
+ "mul v3.4s, v3.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0x120]\n"
+ "str q0, [%x[outptr5]], #0x10\n"
+ "mul v4.4s, v4.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0x130]\n"
+ "str q1, [%x[outptr5]], #0x10\n"
+ "mul v5.4s, v5.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0x140]\n"
+ "str q2, [%x[outptr5]], #0x10\n"
+ "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+ "ldr q0, [%x[outptr7]]\n"
+ "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+ "ldr q1, [%x[outptr7], #0x10]\n"
+ "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+ "ldr q2, [%x[outptr7], #0x20]\n"
+
+ // Row 7
+ "mul v0.4s, v0.4s, %[beta_value].4s\n"
+ "ldr q6, [%x[inptr], #0x150]\n"
+ "str q3, [%x[outptr6]], #0x10\n"
+ "mul v1.4s, v1.4s, %[beta_value].4s\n"
+ "ldr q7, [%x[inptr], #0x160]\n"
+ "str q4, [%x[outptr6]], #0x10\n"
+ "mul v2.4s, v2.4s, %[beta_value].4s\n"
+ "ldr q8, [%x[inptr], #0x170]\n"
+ "str q5, [%x[outptr6]], #0x10\n"
+ "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+ "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+ "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+ "str q0, [%x[outptr7]], #0x10\n"
+ "str q1, [%x[outptr7]], #0x10\n"
+ "str q2, [%x[outptr7]], #0x10\n"
+
+ "add %x[inptr], %x[inptr], #0x180\n"
+ : [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1),
+ [outptr2] "+r"(outptr2),
+ [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4),
+ [outptr5] "+r"(outptr5),
+ [outptr6] "+r"(outptr6),
+ [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [alpha_value] "w"(alpha_value),
+ [beta_value] "w"(beta_value)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+ }
+ }
+ }
+}
+
+template <>
+inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta)
+{
+    // Since the above code uses only MUL and MLA instructions, which produce the
+    // same bit patterns for signed and unsigned operands (e.g. 0xFFFFFFFFu * 2u and
+    // int32_t(-1) * 2 both yield 0xFFFFFFFE), we can discard the "unsignedness"
+    // and forward to the int32_t version safely.
+    MergeResults<12, 8>(reinterpret_cast<int32_t *>(out), reinterpret_cast<const int32_t *>(in), ldout, y0, ymax, x0, xmax, static_cast<int32_t>(alpha), static_cast<int32_t>(beta));
+}
+
+#endif // __aarch64__
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
similarity index 65%
copy from src/graph/CL/CLUnmap.cpp
copy to src/core/NEON/kernels/arm_gemm/merges/list.hpp
index 31f2f19..d93f1b0 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLUnmap.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute::graph;
-
-CLUnmap::CLUnmap(ITensorObject *tensor)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
-{
- _tensor->unmap(arm_compute::CLScheduler::get().queue());
-}
+#include "a32_merge_float_8x6.hpp"
+#include "a64_merge_float_12x8.hpp"
+#include "a64_merge_float_to_half_12x8.hpp"
+#include "a64_merge_half_24x8.hpp"
+#include "a64_merge_int32_12x8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/profiler.hpp b/src/core/NEON/kernels/arm_gemm/profiler.hpp
new file mode 100644
index 0000000..ada0c95
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/profiler.hpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef CYCLE_PROFILING
+
+#include "../perf.h"
+
+#ifndef NO_MULTI_THREADING
+#include <mutex>
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+extern std::mutex report_mutex;
+#endif
+
+class profiler
+{
+private:
+ static const int maxevents = 100000;
+ unsigned long times[maxevents] = {};
+ unsigned long units[maxevents] = {};
+ int events[maxevents] = {};
+ int currentevent = 0;
+ int countfd = 0;
+
+ class ScopedProfilerClass
+ {
+ private:
+ profiler &_parent;
+ bool legal = false;
+
+ public:
+ ScopedProfilerClass(profiler &prof, int i, unsigned long u)
+ : _parent(prof)
+ {
+ if(prof.currentevent == maxevents)
+ return;
+
+ prof.events[prof.currentevent] = i;
+ prof.units[prof.currentevent] = u;
+ legal = true;
+ start_counter(prof.countfd);
+ }
+
+ ~ScopedProfilerClass()
+ {
+ if(!legal)
+ return;
+
+ long long cycs = stop_counter(_parent.countfd);
+ _parent.times[_parent.currentevent++] = cycs;
+ }
+ };
+
+public:
+ profiler()
+ {
+ countfd = open_cycle_counter();
+ }
+
+ ~profiler()
+ {
+ close(countfd);
+ int tots[5];
+ unsigned long counts[5];
+ unsigned long tunits[5];
+ const char *descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+
+ for(int i = 1; i < 5; i++)
+ {
+ tots[i] = 0;
+ counts[i] = 0;
+ tunits[i] = 0;
+ }
+
+ for(int i = 0; i < currentevent; i++)
+ {
+ // printf("%10s: %ld\n", descs[events[i]-1], times[i]);
+ tots[events[i]]++;
+ counts[events[i]] += times[i];
+ tunits[events[i]] += units[i];
+ }
+
+#ifdef NO_MULTI_THREADING
+ printf("Profiled events:\n");
+#else
+ std::lock_guard<std::mutex> lock(report_mutex);
+ printf("Profiled events (cpu %d):\n", sched_getcpu());
+#endif
+
+ printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
+ for(int i = 1; i < 5; i++)
+ {
+ printf("%20s: %9d %9ld %9ld %12lu %9.2f\n", descs[i - 1], tots[i], counts[i], counts[i] / tots[i], tunits[i], (float)tunits[i] / counts[i]);
+ }
+ }
+
+ template <typename T>
+ void operator()(int i, unsigned long u, T func)
+ {
+ if(currentevent == maxevents)
+ {
+ func();
+ }
+ else
+ {
+ events[currentevent] = i;
+ units[currentevent] = u;
+ start_counter(countfd);
+ func();
+ long long cycs = stop_counter(countfd);
+ times[currentevent++] = cycs;
+ }
+ }
+ ScopedProfilerClass ScopedProfiler(int i, unsigned long u)
+ {
+ return ScopedProfilerClass(*this, i, u);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // CYCLE_PROFILING
+
+#define PROFILE_PREPA 1
+#define PROFILE_PREPB 2
+#define PROFILE_KERNEL 3
+#define PROFILE_MERGE 4
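+
+// Illustrative usage sketch (not part of the file): in a CYCLE_PROFILING
+// build, a caller wraps each phase in operator(), passing a PROFILE_* id and
+// a work estimate in bytes or MACs; per-event totals are printed when the
+// profiler is destroyed. run_kernel(), M, N and K are hypothetical here:
+//
+//   arm_gemm::profiler prof;
+//   prof(PROFILE_KERNEL, static_cast<unsigned long>(M) * N * K,
+//        [&]() { run_kernel(); });
+//
+// The ScopedProfiler() helper does the same for the lifetime of a scope.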
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
new file mode 100644
index 0000000..c80bb59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row. The same number of values
+ * is then read from each of the next <IntBy-1> rows. Now return to the
+ * first input row and repeat.
+ *
+ * Need to cope with the work requested in either dimension not actually
+ * being a multiple of the block sizes.
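+ *
+ * Illustrative example: with IntBy=8 and (ymax - y0) == 20, the "Y" loop
+ * covers two whole 8-row blocks plus one ragged block of 4 real rows padded
+ * with 4 rows of zeroes; BlockBy handles the X dimension the same way.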
+ */
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
+struct TransformImpl
+{
+ template <typename TOut, typename TIn>
+ static void Transform(TOut *out, const TIn *const in, const int stride,
+ const int y0, const int ymax, const int x0, const int xmax)
+ {
+ const int n_whole_y_blocks = (ymax - y0) / IntBy;
+ const int y_remainders = (ymax - y0) % IntBy;
+ const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+ const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+ const int x_remainders = (xmax - x0) % BlockBy;
+ const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+ // "Y" loop: advance down the rows of the source IntBy rows at a time.
+        // Set up fill_rows to show the number of rows to copy from, and blank_rows
+ // for the number of blank rows to add.
+ for(int y_block = 0; y_block < n_y_blocks; y_block++)
+ {
+ int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+ int blank_rows = IntBy - fill_rows;
+
+ int y_base = y0 + (y_block * IntBy);
+
+ // So now advance along this block of rows, BlockBy columns at a time.
+ for(int x_block = 0; x_block < n_x_blocks; x_block++)
+ {
+ int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+ int blank_cols = BlockBy - fill_cols;
+
+ int x_base = x0 + (x_block * BlockBy);
+
+ for(int row = 0; row < fill_rows; row++)
+ {
+ for(int col = 0; col < fill_cols; col++)
+ {
+ // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
+ if(Transposed)
+ {
+ *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+ }
+ else
+ {
+ *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+ }
+ }
+ // "col" tail - row is in range but column is out of range.
+ for(int col = 0; col < blank_cols; col++)
+ {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ // "row" tail - row is out of range so fill with zeros always.
+ for(int row = 0; row < blank_rows; row++)
+ {
+ for(int col = 0; col < (fill_cols + blank_cols); col++)
+ {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ }
+ }
+ }
+
+ template <typename T>
+ static inline void Transform(T *out, const T *const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax)
+ {
+ Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+ }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
+void Transform(
+ TOut *out, const TIn *const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax)
+{
+ // Redirect to a specialised implementation predicated on argument size.
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+ out, in, stride, k0, kmax, x0, xmax);
+}
+/*****************************************************************************/
+
+#include "transforms/list.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
new file mode 100644
index 0000000..501d6bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
+ const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+
+    uint32_t zerobuff[8] = { 0 }; // initialised so the "buffer of zeroes" really is zero
+
+ for(int y = y0; y < ymax; y += 6)
+ {
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+
+ //prefetch_2x(inptr0);
+ //prefetch_2x(inptr1);
+ //prefetch_2x(inptr2);
+ //prefetch_2x(inptr3);
+ //prefetch_2x(inptr4);
+ //prefetch_2x(inptr5);
+
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 5) >= ymax)
+ {
+ switch((y + 5) - ymax)
+ {
+ /* Everything falls through in here */
+ case 4:
+ inptr1 = zerobuff;
+ case 3:
+ inptr2 = zerobuff;
+ case 2:
+ inptr3 = zerobuff;
+ case 1:
+ inptr4 = zerobuff;
+ case 0:
+ inptr5 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ __asm __volatile(
+            // Load up 8 elements (2 vectors) from each of 6 sources.
+ "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
+ "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
+ "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
+ "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
+ "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+ "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
+ "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
+ "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
+ "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+
+ // Store first elements
+ "VST1.32 {d0-d1}, [%[outptr]]!\n"
+ "VST1.32 {d16}, [%[outptr]]!\n"
+
+ "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+
+ // Store second elements
+ "VST1.32 {d4-d5}, [%[outptr]]!\n"
+ "VZIP.32 q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]")
+ "VST1.32 {d17}, [%[outptr]]!\n"
+ "VZIP.32 q3, q7\n"
+
+ // Store third elements
+ "VZIP.32 q9, q11\n"
+ "VST1.32 {d8-d9}, [%[outptr]]!\n"
+ "VZIP.32 q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]")
+ "VST1.32 {d20}, [%[outptr]]!\n"
+
+ // Store fourth elements
+ "VZIP.32 q5, q7\n"
+ "VST1.32 {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]")
+ "VST1.32 {d21}, [%[outptr]]!\n"
+
+ // Fifth
+ "VST1.32 {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]")
+ "VST1.32 {d18}, [%[outptr]]!\n"
+
+ // Sixth
+ "VST1.32 {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]")
+ "VST1.32 {d19}, [%[outptr]]!\n"
+
+ // Seventh
+ "VST1.32 {d10-d11}, [%[outptr]]!\n"
+ "VST1.32 {d22}, [%[outptr]]!\n"
+
+ // Eighth
+ "VST1.32 {d14-d15}, [%[outptr]]!\n"
+ "VST1.32 {d23}, [%[outptr]]!\n"
+
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
+ }
+
+ for(; x > 0; x--)
+ {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ }
+ }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..ea32c96
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 8x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, true, 4, 4>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a 16x uint16_t specialisation
+ TransformImpl<16, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
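+// (Each 32-bit element is exactly two 16-bit elements, so the stride and the
+// x range are doubled while the k range is unchanged; the 12- and 24-way
+// transposed variants use the same trick.)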
+
+// Generic 16x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 16 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+ __asm volatile(
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ : [in0] "+r"(in0),
+ [out] "+r"(out)
+ :
+ : "q0", "q1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+ __asm volatile(
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB %[out], %[out], #32\n"
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [out] "+r"(out)
+ :
+ : "q0", "q1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+ __asm __volatile(
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
+ "VLD1.32 {d0-d3}, [%[in2]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
+ "VLD1.32 {d0-d3}, [%[in3]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB %[out], %[out], #96\n"
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [in2] "+r"(in2),
+ [in3] "+r"(in3),
+ [out] "+r"(out)
+ :
+ : "q0", "q1", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+ uint16_t *out, const uint16_t *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
new file mode 100644
index 0000000..8d61f15
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+#include "../utils.hpp"
+
+template <>
+template <typename T>
+void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint8_t *outptr = (uint8_t *)out;
+    const uint8_t *inptr = (const uint8_t *)in;
+
+    uint8_t zerobuff[16] = { 0 }; // initialised so the "buffer of zeroes" really is zero
+
+ for(int y = y0; y < ymax; y += 4)
+ {
+ const uint8_t *inptr0 = inptr + y * ldin + k0;
+ const uint8_t *inptr1 = inptr0 + ldin;
+ const uint8_t *inptr2 = inptr1 + ldin;
+ const uint8_t *inptr3 = inptr2 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+
+ int x = (kmax - k0);
+ for(; x > 15; x -= 16)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 3) >= ymax)
+ {
+ switch((y + 3) - ymax)
+ {
+ /* Everything falls through in here */
+ case 2:
+ inptr1 = zerobuff;
+ case 1:
+ inptr2 = zerobuff;
+ case 0:
+ inptr3 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ __asm __volatile(
+ "LDR q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
+ "STP q0, q1, [%[outptr]], #32\n"
+ "LDR q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP q0, q1, [%[outptr]], #32\n"
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [outptr] "+r"(outptr)
+ :
+ : "v0", "v1");
+ }
+
+ if(x > 0)
+ {
+ /* Need to duplicate this here, in case we didn't run the main loop. */
+ if((y + 3) >= ymax)
+ {
+ switch((y + 3) - ymax)
+ {
+ /* Everything falls through in here */
+ case 2:
+ inptr1 = zerobuff;
+ case 1:
+ inptr2 = zerobuff;
+ case 0:
+ inptr3 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
+ auto f = [&outptr, x](const uint8_t *&p)
+ {
+ for(int i = 0; i < 16; i++)
+ {
+ if(i < x)
+ {
+ *outptr++ = *p++;
+ }
+ else
+ {
+ *outptr++ = 0;
+ }
+ }
+ };
+
+ f(inptr0);
+ f(inptr1);
+ f(inptr2);
+ f(inptr3);
+ }
+ }
+}
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
new file mode 100644
index 0000000..3cbc881
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint16_t *outptr = (uint16_t *)out;
+ const uint16_t *inptr = (const uint16_t *)in;
+
+    uint16_t zerobuff[24] = { 0 }; // initialised so the "buffer of zeroes" really is zero
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ const uint16_t *inptr0 = inptr + y * ldin + k0;
+ const uint16_t *inptr1 = inptr0 + ldin;
+ const uint16_t *inptr2 = inptr1 + ldin;
+ const uint16_t *inptr3 = inptr2 + ldin;
+ const uint16_t *inptr4 = inptr3 + ldin;
+ const uint16_t *inptr5 = inptr4 + ldin;
+ const uint16_t *inptr6 = inptr5 + ldin;
+ const uint16_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ int skippf = (x & 31);
+ __asm __volatile(
+ // Load up 8 elements (1 vector) from each of 8 sources.
+ "CBNZ %w[skippf], 1f\n" ASM_PREFETCH("[%[inptr0], #128]")
+ ASM_PREFETCH("[%[inptr1], #128]")
+ ASM_PREFETCH("[%[inptr2], #128]")
+ ASM_PREFETCH("[%[inptr3], #128]")
+ "1:\n"
+
+ "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
+ "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
+ "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
+ "LDR q6, [%[inptr6]], #16\n"
+ "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
+ "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
+ "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
+ "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
+ "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
+ "LDR q5, [%[inptr5]], #16\n"
+ "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
+ "LDR q7, [%[inptr7]], #16\n"
+ "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
+ "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
+ "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
+ "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
+
+ "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1
+ "ZIP2 v20.8h, v8.8h, v9.8h\n"
+ "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
+ "ZIP2 v21.8h, v10.8h, v11.8h\n"
+
+ "CBNZ %w[skippf], 2f\n" ASM_PREFETCH("[%[inptr4], #112]")
+ ASM_PREFETCH("[%[inptr5], #112]")
+ ASM_PREFETCH("[%[inptr6], #112]")
+ ASM_PREFETCH("[%[inptr7], #112]")
+ "2:\n"
+
+ "ZIP1 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v30.8h, v16.8h, v17.8h\n"
+ "ZIP1 v23.8h, v18.8h, v19.8h\n"
+ "ZIP2 v31.8h, v18.8h, v19.8h\n"
+
+ "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
+ "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
+ "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements
+
+ "ZIP1 v0.8h, v20.8h, v21.8h\n"
+ "ZIP2 v1.8h, v20.8h, v21.8h\n"
+ "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements
+
+ "ZIP1 v2.8h, v22.8h, v23.8h\n"
+ "ZIP2 v3.8h, v22.8h, v23.8h\n"
+ "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements
+
+ "ZIP1 v4.8h, v30.8h, v31.8h\n"
+ "ZIP2 v5.8h, v30.8h, v31.8h\n"
+ "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+ : [skippf] "r"(skippf)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+ }
+
+ for(; x > 0; x--)
+ {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..47e4fa2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint32_t *outptr = (uint32_t *)out;
+    const uint32_t *inptr = (const uint32_t *)in;
+
+    uint32_t zerobuff[8] = { 0 }; // initialised so the "buffer of zeroes" really is zero
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+ const uint32_t *inptr6 = inptr5 + ldin;
+ const uint32_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ __asm __volatile(
+ // Load up 8 elements (2 vectors) from each of 8 sources.
+ "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
+ "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
+ "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
+ "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
+ "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+ "LDP q8, q9, [%[inptr4]], #32\n"
+ "LDP q10, q11, [%[inptr5]], #32\n"
+ "LDP q12, q13, [%[inptr6]], #32\n"
+ "ZIP1 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]")
+ "LDP q14, q15, [%[inptr7]], #32\n"
+ "ZIP1 v19.4s, v10.4s, v14.4s\n"
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP2 v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+ "ZIP2 v17.4s, v2.4s, v6.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+ "ZIP2 v18.4s, v8.4s, v12.4s\n"
+ "ZIP2 v19.4s, v10.4s, v14.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP1 v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP1 v17.4s, v3.4s, v7.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Third element
+
+ "ZIP1 v18.4s, v9.4s, v13.4s\n"
+ "ZIP1 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Fourth element
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP2 v16.4s, v1.4s, v5.4s\n"
+ "ZIP2 v17.4s, v3.4s, v7.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Fifth element
+
+ "ZIP2 v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]")
+ "ZIP2 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Sixth element
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Seventh element
+
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+ }
+
+ for(; x > 0; x--)
+ {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
new file mode 100644
index 0000000..1d2d496
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <>
+inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ float *outptr = out;
+ const __fp16 *inptr = in;
+
+    __fp16 zerobuff[8] = { 0 }; // initialised so the "buffer of zeroes" really is zero
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ const __fp16 *inptr0 = inptr + y * ldin + k0;
+ const __fp16 *inptr1 = inptr0 + ldin;
+ const __fp16 *inptr2 = inptr1 + ldin;
+ const __fp16 *inptr3 = inptr2 + ldin;
+ const __fp16 *inptr4 = inptr3 + ldin;
+ const __fp16 *inptr5 = inptr4 + ldin;
+ const __fp16 *inptr6 = inptr5 + ldin;
+ const __fp16 *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ __asm __volatile(
+ // Load up 8 elements (2 vectors) from each of 8 sources.
+ "LDR q0, [%[inptr0]], #16\n"
+ "LDR q2, [%[inptr1]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+ "FCVTL2 v3.4s, v2.8h\n"
+ "FCVTL v2.4s, v2.4h\n"
+ "FCVTL2 v5.4s, v4.8h\n"
+ "FCVTL v4.4s, v4.4h\n"
+ "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+ "FCVTL2 v7.4s, v6.8h\n"
+ "FCVTL v6.4s, v6.4h\n"
+ "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+ "LDR q8, [%[inptr4]], #16\n"
+ "LDR q10, [%[inptr5]], #16\n"
+ "FCVTL2 v9.4s, v8.8h\n"
+ "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]")
+ "LDR q12, [%[inptr6]], #16\n"
+ "FCVTL2 v11.4s, v10.8h\n"
+ "FCVTL v10.4s, v10.4h\n"
+ "FCVTL2 v13.4s, v12.8h\n"
+ "FCVTL v12.4s, v12.4h\n"
+ "ZIP1 v18.4s, v8.4s, v12.4s\n"
+ "LDR q14, [%[inptr7]], #16\n"
+ "FCVTL2 v15.4s, v14.8h\n"
+ "FCVTL v14.4s, v14.4h\n"
+ "ZIP1 v19.4s, v10.4s, v14.4s\n"
+
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+
+ "ZIP2 v16.4s, v0.4s, v4.4s\n"
+ "ZIP2 v17.4s, v2.4s, v6.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+ "ZIP2 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP2 v19.4s, v10.4s, v14.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP1 v16.4s, v1.4s, v5.4s\n"
+ "ZIP1 v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+ "STP q20, q21, [%[outptr]], #32\n" // Third element
+
+ "ZIP1 v18.4s, v9.4s, v13.4s\n"
+ "ZIP1 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Fourth element
+ ASM_PREFETCH("[%[inptr7], #128]")
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP2 v16.4s, v1.4s, v5.4s\n"
+ "ZIP2 v17.4s, v3.4s, v7.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Fifth element
+
+ "ZIP2 v18.4s, v9.4s, v13.4s\n"
+ "ZIP2 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Sixth element
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Seventh element
+
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+ }
+
+ for(; x > 0; x--)
+ {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
new file mode 100644
index 0000000..fd6a253
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 6x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, true, 4, 4>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a 12 x uint16_t specialisation
+ TransformImpl<12, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 12x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 12 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+ __asm volatile(
+ "LDR q0, [%[in0]]\n"
+ "STR q0, [%[out]]\n"
+ "LDR d1, [%[in0], #0x10]\n"
+ "STR d1, [%[out], #0x10]\n"
+ "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+ : [in0] "+r"(in0),
+ [out] "+r"(out)
+ :
+ : "v0", "v1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+ __asm volatile(
+ "LDR q0, [%[in0]]\n"
+ "LDR d1, [%[in0], #0x10]\n"
+ "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+
+ "LDR x21, [%[in1]]\n"
+ "LDR q2, [%[in1], #0x08]\n"
+ "INS v1.d[1], x21\n"
+ "ADD %x[in1], %x[in1], #0x18\n"
+ "STP q0, q1, [%[out]]\n"
+ "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [out] "+r"(out)
+ :
+ : "x21", "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+ __asm __volatile(
+ "LDR q0, [%x[in0]], #0x10\n"
+ "STR q0, [%x[out]]\n"
+ "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
+ "STR d1, [%x[out], #0x10]\n"
+
+ "LDR q0, [%x[in1]], #0x10\n"
+ "STR q0, [%x[out], #0x18]\n"
+ "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
+ "STR d1, [%x[out], #0x28]\n"
+
+ "LDR q0, [%x[in2]], #0x10\n"
+ "STR q0, [%x[out], #0x30]\n"
+ "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
+ "STR d1, [%x[out], #0x40]\n"
+
+ "LDR q0, [%x[in3]], #0x10\n"
+ "STR q0, [%x[out], #0x48]\n"
+ "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [in2] "+r"(in2),
+ [in3] "+r"(in3),
+ [out] "+r"(out)
+ :
+ : "v0", "v1", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+ uint16_t *out, const uint16_t *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
new file mode 100644
index 0000000..b79f32f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include "transpose_interleave_common.hpp"
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out)
+{
+ __asm __volatile(
+ "LDR q0, [%[in0]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDR d2, [%[in0]], #8\n"
+ "FCVTL v2.4s, v2.4h\n"
+ "STR q2, [%[out], #32]\n"
+ : [in0] "+r"(in0), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out)
+{
+ __asm __volatile(
+ "LDR q0, [%[in0]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDR d2, [%[in0]], #8\n"
+ "FCVTL v2.4s, v2.4h\n"
+ "LDR q3, [%[in1]], #16\n"
+ "FCVTL2 v4.4s, v3.8h\n"
+ "FCVTL v3.4s, v3.4h\n"
+ "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+ "LDR d5, [%[in1]], #16\n"
+ "FCVTL v5.4s, v5.4h\n"
+ "STP q4, q5, [%[out], #64]\n"
+ : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out)
+{
+ __asm __volatile(
+ "LDR q0, [%[in0]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "STP q0, q1, [%[out]]\n"
+ "LDR d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]")
+ "FCVTL v2.4s, v2.4h\n"
+ "LDR q3, [%[in1]], #16\n"
+ "FCVTL2 v4.4s, v3.8h\n"
+ "FCVTL v3.4s, v3.4h\n"
+ "STP q2, q3, [%[out], #32]\n"
+ "LDR d5, [%[in1]], #8\n"
+ "FCVTL v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]")
+ "STP q4, q5, [%[out], #64]\n"
+ "LDR q6, [%[in2]], #16\n"
+ "FCVTL2 v7.4s, v6.8h\n"
+ "FCVTL v6.4s, v6.4h\n"
+ "STP q6, q7, [%[out], #96]\n"
+ "LDR d8, [%[in2]], #8\n"
+ "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]")
+ "LDR q9, [%[in3]], #16\n"
+ "FCVTL2 v10.4s, v9.8h\n"
+ "FCVTL v9.4s, v9.4h\n"
+ "STP q8, q9, [%[out], #128]\n"
+ "LDR d11, [%[in3]], #8\n"
+ "FCVTL v11.4s, v11.4h\n"
+ "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+ : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 4, 2>::Transform(
+ float *out, const __fp16 *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
new file mode 100644
index 0000000..5434599
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 12x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 4, 4>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a 24 x uint16_t specialisation
+ TransformImpl<24, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 24x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 24 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+ __asm __volatile(
+ "LDP q0, q1, [%[in0]], #32\n"
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDR q2, [%[in0]], #16\n"
+ "STR q2, [%[out], #32]\n"
+ : [in0] "+r"(in0), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+ __asm __volatile(
+ "LDP q0, q1, [%[in0]], #32\n"
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDR q2, [%[in0]], #16\n"
+ "LDP q3, q4, [%[in1]], #32\n"
+ "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+ "LDR q5, [%[in1]], #16\n"
+ "STP q4, q5, [%[out], #64]\n"
+ : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+ __asm __volatile(
+ "LDP q0, q1, [%[in0]], #32\n"
+ "STP q0, q1, [%[out]]\n"
+ "LDR q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDP q3, q4, [%[in1]], #32\n"
+ "STP q2, q3, [%[out], #32]\n"
+ "LDR q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
+ "STP q4, q5, [%[out], #64]\n"
+ "LDP q6, q7, [%[in2]], #32\n"
+ "STP q6, q7, [%[out], #96]\n"
+ "LDR q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
+ "LDP q9, q10, [%[in3]], #32\n"
+ "STP q8, q9, [%[out], #128]\n"
+ "LDR q11, [%[in3]], #16\n"
+ "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+ : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+ uint16_t *out, const uint16_t *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
similarity index 65%
copy from src/graph/CL/CLUnmap.cpp
copy to src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 31f2f19..8ad5b85 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLUnmap.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute::graph;
-
-CLUnmap::CLUnmap(ITensorObject *tensor)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
-{
- _tensor->unmap(arm_compute::CLScheduler::get().queue());
-}
+#include "a32_interleave_6way_32bit.hpp"
+#include "a32_transpose_interleave_8way_32bit.hpp"
+#include "a64_block16_interleave4_8bit.hpp"
+#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
+#include "a64_interleave_8way_half_to_float.hpp"
+#include "a64_transpose_interleave_12way_16bit.hpp"
+#include "a64_transpose_interleave_12way_half_to_float.hpp"
+#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "transpose_interleave_common.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000..3218ca1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon
+{
+    // Architecture-specific headers specialise the moveblock_1xY methods for
+    // performance; these generic element-by-element loops are the fallback.
+ static inline void moveblock_1x1(const TIn *&in0, TOut *out)
+ {
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ }
+
+ static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out)
+ {
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ }
+
+ static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out)
+ {
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in2++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in3++);
+ }
+ }
+
+ static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax)
+ {
+ const auto ldin = stride;
+
+ TOut *outarray = out;
+ const TIn *inarray = in;
+ TOut *outptr_base = outarray;
+ const TIn *inptr_base = inarray + x0 + (k0 * ldin);
+ int ldout = (kmax - k0) * IntBy;
+
+ int k = (kmax - k0);
+ for(; k > 3; k -= 4)
+ {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+ const TIn *inptr3 = inptr2 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+ prefetch_3x(inptr3);
+
+ outptr_base += IntBy * 4;
+ inptr_base += ldin * 4;
+
+ for(int x = (xmax - x0) / IntBy; x > 0; x--)
+ {
+ moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+ outptr += ldout;
+ }
+ }
+
+ if(k)
+ {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+
+ for(int x = (xmax - x0) / IntBy; x > 0; x--)
+ {
+ switch(k)
+ {
+ case 3:
+ moveblock_1x2(inptr, inptr1, outptr);
+ moveblock_1x1(inptr2, outptr + IntBy * 2);
+ break;
+
+ case 2:
+ moveblock_1x2(inptr, inptr1, outptr);
+ break;
+
+ case 1:
+ moveblock_1x1(inptr, outptr);
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+
+ outptr += ldout;
+ }
+ }
+
+ // Cope with ragged X cases
+ const unsigned int overflow = (xmax - x0) % IntBy;
+ if(overflow)
+ {
+ const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+ TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
+
+ for(int k = (kmax - k0); k > 0; k--)
+ {
+ const TIn *inptr = inptr_base;
+ inptr_base += ldin;
+
+ for(unsigned int x = 0; x < IntBy; x++)
+ {
+ TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+ *outptr++ = val;
+ }
+ }
+ }
+ }
+};
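
For reference, the panel layout produced by Transform above can be reproduced
with a small scalar model. The sketch below is illustrative only: it
reimplements the indexing rather than including the header (which also needs
prefetch_3x from elsewhere in arm_gemm), and the helper name
ref_transpose_interleave and the 3x8 test matrix are invented for the example.

#include <cstdio>
#include <vector>

// Scalar model of TransposeInterleaveCommon<IntBy>::Transform: columns
// [x0, xmax) are split into panels of IntBy, each panel storing rows
// [k0, kmax) back-to-back; a ragged final panel is zero-padded.
template <unsigned int IntBy>
std::vector<float> ref_transpose_interleave(const float *in, int stride,
                                            int x0, int xmax, int k0, int kmax)
{
    const int n_panels = (xmax - x0 + IntBy - 1) / IntBy;
    std::vector<float> out(n_panels * (kmax - k0) * IntBy, 0.0f);
    for (int p = 0; p < n_panels; p++)
        for (int k = k0; k < kmax; k++)
            for (unsigned int i = 0; i < IntBy; i++)
            {
                const int x = x0 + p * IntBy + i;
                if (x < xmax)
                    out[(p * (kmax - k0) + (k - k0)) * IntBy + i] = in[k * stride + x];
            }
    return out;
}

int main()
{
    float in[3 * 8];
    for (int i = 0; i < 24; i++) in[i] = float(i);
    const auto out = ref_transpose_interleave<4>(in, 8, 0, 8, 0, 3);
    for (float v : out) printf("%g ", v); // two panels of 4 columns x 3 rows each
    printf("\n");
}
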
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
similarity index 66%
rename from src/graph/CL/CLUnmap.cpp
rename to src/core/NEON/kernels/arm_gemm/utils.hpp
index 31f2f19..6c5b92a 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,31 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLUnmap.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#pragma once
-using namespace arm_compute::graph;
+// Macro for unreachable code (e.g. impossible default cases in a switch statement)
+#define UNREACHABLE(why) __builtin_unreachable()
-CLUnmap::CLUnmap(ITensorObject *tensor)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
+// Paranoid alternative to the above, which asserts instead
+// #define UNREACHABLE(why) assert(0 && why)
+
+inline int iceildiv(const int a, const int b)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+ return (a + b - 1) / b;
}
-void CLUnmap::run()
+template <typename T>
+inline T roundup(const T a, const T b)
{
- _tensor->unmap(arm_compute::CLScheduler::get().queue());
+ T rem = a % b;
+
+ if(rem)
+ {
+ return a + b - rem;
+ }
+ else
+ {
+ return a;
+ }
}
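
A quick sanity check on the two helpers added above (illustrative only, and
assuming the definitions from utils.hpp are in scope):

#include <cassert>

int main()
{
    assert(iceildiv(7, 4) == 2); // 7 elements in blocks of 4 need 2 blocks
    assert(iceildiv(8, 4) == 2); // exact multiples are not rounded up further
    assert(roundup(7, 4) == 8);  // next multiple of 4 at or above 7
    assert(roundup(8, 4) == 8);  // already a multiple: returned unchanged
    return 0;
}
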
diff --git a/src/core/NEON/kernels/convolution/common/utils.cpp b/src/core/NEON/kernels/convolution/common/utils.cpp
index 24d0386..45847bb 100644
--- a/src/core/NEON/kernels/convolution/common/utils.cpp
+++ b/src/core/NEON/kernels/convolution/common/utils.cpp
@@ -23,18 +23,6 @@
*/
#include <cstdio>
-#include <ctime>
-
-double TimeInUs(void)
-{
-#ifdef CYCLE_PROFILING
- timespec t;
- clock_gettime(CLOCK_REALTIME, &t);
- return 1e6*t.tv_sec + 1e-3*t.tv_nsec;
-#else
- return 0;
-#endif
-}
void PrintMatrix(const float* const m, const int M, const int N, const int row_stride)
{
@@ -47,4 +35,4 @@
printf("\n");
}
printf("\n");
-}
+}
\ No newline at end of file
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index fa50f79..9b3a60d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -28,412 +28,543 @@
using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 1, 1, float, float>;
+#ifdef __aarch64__
+
template <>
-const Conv::TileFn Conv::tile_fns
- [max_in_pad_top]
- [max_in_pad_left]
- [max_in_pad_bottom]
- [max_in_pad_right]
- [max_out_pad_bottom]
- [max_out_pad_right] = {
- { // Input pad top = 0
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 2
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 2
- }, // Input pad left = 1
- }, // Input pad top = 0
- { // Input pad top = 1
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 2
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- }, // Input pad bottom = 2
- }, // Input pad left = 1
- }, // Input pad top = 1
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+ const int n_channels,
+ const float* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int, const int, const int, const int, const int, const int
+)
+{
+ // Copy pointers
+ const float *uptr0 = inptr;
+ const float *wptr0 = weights;
+ float *vptr0 = outptr;
+
+ int channels_remaining = n_channels;
+ if (channels_remaining >= 4)
+ {
+ // Process blocks of 4 channels at a time
+ int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
+ const bool odd_tail = (channels_remaining / 4) & 1;
+ channels_remaining %= 4;
+
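+    // The kernel below is software-pipelined two blocks deep: the 'A' and
+    // 'B' register banks alternate so that each half of the main loop
+    // computes one 4-channel block while loading operands for the next.
+    // n_iters counts the A+B pairs executed by the loop; odd_tail selects
+    // the epilogue: label 3 (finish A then B) or label 4 (finish A only).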
+ asm volatile (
+ "qW11B .req q0\n" "vW11B .req v0\n" "qW33A .req q1\n" "qU32B .req q1\n"
+ "vW33A .req v1\n" "vU32B .req v1\n" "qU44B .req q2\n" "qW21A .req q2\n"
+ "vU44B .req v2\n" "vW21A .req v2\n" "qU21B .req q3\n" "qU32A .req q3\n"
+ "vU21B .req v3\n" "vU32A .req v3\n" "qU43A .req q4\n" "qV21B .req q4\n"
+ "vU43A .req v4\n" "vV21B .req v4\n" "qU24A .req q5\n" "qU44A .req q5\n"
+ "qU33B .req q5\n" "vU24A .req v5\n" "vU44A .req v5\n" "vU33B .req v5\n"
+ "qU31A .req q6\n" "qV12B .req q6\n" "qU23A .req q6\n" "vU31A .req v6\n"
+ "vV12B .req v6\n" "vU23A .req v6\n" "qW31B .req q7\n" "qV22A .req q7\n"
+ "vW31B .req v7\n" "vV22A .req v7\n" "qV12A .req q8\n" "qW21B .req q8\n"
+ "vV12A .req v8\n" "vW21B .req v8\n" "qU22B .req q9\n" "qU34A .req q9\n"
+ "vU22B .req v9\n" "vU34A .req v9\n" "qU13B .req q10\n" "qU13A .req q10\n"
+ "vU13B .req v10\n" "vU13A .req v10\n" "qU34B .req q11\n" "qU22A .req q11\n"
+ "vU34B .req v11\n" "vU22A .req v11\n" "qU24B .req q12\n" "qU31B .req q12\n"
+ "vU24B .req v12\n" "vU31B .req v12\n" "qW12B .req q13\n" "qW13A .req q13\n"
+ "vW12B .req v13\n" "vW13A .req v13\n" "qV21A .req q14\n" "qV11B .req q14\n"
+ "vV21A .req v14\n" "vV11B .req v14\n" "qW32A .req q15\n" "qW32B .req q15\n"
+ "vW32A .req v15\n" "vW32B .req v15\n" "qW31A .req q16\n" "qV22B .req q16\n"
+ "vW31A .req v16\n" "vV22B .req v16\n"
+ "qW11A .req q17\n" "vW11A .req v17\n" "qW13B .req q18\n" "qU14A .req q18\n"
+ "vW13B .req v18\n" "vU14A .req v18\n" "qU33A .req q19\n" "qW33B .req q19\n"
+ "vU33A .req v19\n" "vW33B .req v19\n" "qW22A .req q20\n" "qU23B .req q20\n"
+ "vW22A .req v20\n" "vU23B .req v20\n" "qU12A .req q21\n" "qU42A .req q21\n"
+ "vU12A .req v21\n" "vU42A .req v21\n" "qU41A .req q22\n" "qU42B .req q22\n"
+ "vU41A .req v22\n" "vU42B .req v22\n" "qW23A .req q23\n" "qW23B .req q23\n"
+ "vW23A .req v23\n" "vW23B .req v23\n" "qU43B .req q24\n" "qU11A .req q24\n"
+ "vU43B .req v24\n" "vU11A .req v24\n" "qU12B .req q25\n" "qW12A .req q25\n"
+ "vU12B .req v25\n" "vW12A .req v25\n" "qU41B .req q26\n" "qV11A .req q26\n"
+ "vU41B .req v26\n" "vV11A .req v26\n" "qW22B .req q27\n" "vW22B .req v27\n"
+ "qU11B .req q28\n" "qU14B .req q28\n" "vU11B .req v28\n" "vU14B .req v28\n"
+ "qU21A .req q29\n" "vU21A .req v29\n"
+
+ "u_col_stride1 .req %x[u_col_stride]\n"
+ "u_col_stride2 .req x0\n"
+ "u_col_stride3 .req x1\n"
+ "uptr1 .req x2\n"
+ "uptr2 .req x3\n"
+ "uptr3 .req x4\n"
+ "wptr1 .req x5\n"
+ "wptr2 .req x6\n"
+ "vptr1 .req x7\n"
+ "w_col_stride1 .req %x[w_col_stride]\n"
+ "w_col_stride2 .req x8\n"
+
+ // Prepare strides and pointers
+ "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+ "add uptr2, uptr1 , %x[u_row_stride]\n"
+ "add uptr3, uptr2 , %x[u_row_stride]\n"
+ "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+ "add wptr2, wptr1 , %x[w_row_stride]\n"
+ "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+ "add u_col_stride2, %x[u_col_stride], %x[u_col_stride]\n"
+ "add u_col_stride3, u_col_stride2 , %x[u_col_stride]\n"
+ "add w_col_stride2, %x[w_col_stride], %x[w_col_stride]\n"
+
+ // Load in preparation for execution
+ "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+ "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+ "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+ "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+ "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+ "ldr qW11A, [%x[wptr0]], #0x10\n"
+ "ldr qU24A, [uptr1, u_col_stride3]\n"
+ "ldr qW23A, [wptr1, w_col_stride2]\n"
+ "ldr qU23A, [uptr1, u_col_stride2]\n"
+ "ldr qW22A, [wptr1, w_col_stride1]\n"
+ "ldr qU22A, [uptr1, u_col_stride1]\n"
+ "ldr qW21A, [wptr1], #0x10\n"
+ "ldr qU34A, [uptr2, u_col_stride3]\n"
+ "ldr qW33A, [wptr2, w_col_stride2]\n"
+ "ldr qU33A, [uptr2, u_col_stride2]\n"
+ "ldr qW32A, [wptr2, w_col_stride1]\n"
+ "ldr qU32A, [uptr2, u_col_stride1]\n"
+ "ldr qW31A, [wptr2], #0x10\n"
+ "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+ "cbz %x[iters], 2f\n" // Jump to tail if doing zero iterations of loop
+
+ "1:" // Main loop body
+ // A part
+ "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+ "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+ "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+ "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+ "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+ "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+ "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+ "ldr qU44A, [uptr3, u_col_stride3]\n"
+ "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+ "ldr qU43A, [uptr3, u_col_stride2]\n"
+ "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+ "ldr qU42A, [uptr3, u_col_stride1]\n"
+ "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+ "ldr qU11A, [%x[uptr0]], #0x10\n"
+ "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+ "ldr qU21A, [uptr1], #0x10\n"
+ "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+ "ldr qU31A, [uptr2], #0x10\n"
+ "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+ "ldr qU41A, [uptr3], #0x10\n"
+ "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+ "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+ "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+ "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+ "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+ "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+ "ldr qW11B, [%x[wptr0]], #0x10\n"
+ "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+ "ldr qU24B, [uptr1, u_col_stride3]\n"
+ "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+ "ldr qW23B, [wptr1, w_col_stride2]\n"
+ "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+ "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
+ "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+ "ldr qU23B, [uptr1, u_col_stride2]\n"
+ "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+ "ldr qW22B, [wptr1, w_col_stride1]\n"
+ "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+ "ldr qU22B, [uptr1, u_col_stride1]\n"
+ "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+ "ldr qW21B, [wptr1], #0x10\n"
+ "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+ "ldr qU34B, [uptr2, u_col_stride3]\n"
+ "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+ "ldr qW33B, [wptr2, w_col_stride2]\n"
+ "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+ "str qV22A, [vptr1, %x[v_col_stride]]\n"
+ "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+ "ldr qU33B, [uptr2, u_col_stride2]\n"
+ "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+ "ldr qW32B, [wptr2, w_col_stride1]\n"
+ "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+ "ldr qU32B, [uptr2, u_col_stride1]\n"
+ "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+ "str qV11A, [%x[vptr0]], #0x10\n"
+ "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+ "ldr qW31B, [wptr2], #0x10\n"
+ "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+ "str qV21A, [vptr1], #0x10\n"
+
+ // B part
+ "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+ "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+ "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+ "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+ "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+ "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+ "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+ "subs %x[iters], %x[iters], #1\n"
+ "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+ "ldr qU44B, [uptr3, u_col_stride3]\n"
+ "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+ "ldr qU43B, [uptr3, u_col_stride2]\n"
+ "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+ "ldr qU42B, [uptr3, u_col_stride1]\n"
+ "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+ "ldr qU11B, [%x[uptr0]], #0x10\n"
+ "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+ "ldr qU21B, [uptr1], #0x10\n"
+ "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+ "ldr qU31B, [uptr2], #0x10\n"
+ "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+ "ldr qU41B, [uptr3], #0x10\n"
+ "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+ "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+ "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+ "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+ "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+ "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+ "ldr qW11A, [%x[wptr0]], #0x10\n"
+ "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+ "ldr qU24A, [uptr1, u_col_stride3]\n"
+ "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+ "ldr qW23A, [wptr1, w_col_stride2]\n"
+ "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+ "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
+ "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+ "ldr qU23A, [uptr1, u_col_stride2]\n"
+ "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+ "ldr qW22A, [wptr1, w_col_stride1]\n"
+ "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+ "ldr qU22A, [uptr1, u_col_stride1]\n"
+ "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+ "ldr qW21A, [wptr1], #0x10\n"
+ "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+ "ldr qU34A, [uptr2, u_col_stride3]\n"
+ "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+ "ldr qW33A, [wptr2, w_col_stride2]\n"
+ "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+ "str qV22B, [vptr1, %x[v_col_stride]]\n"
+ "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+ "ldr qU33A, [uptr2, u_col_stride2]\n"
+ "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+ "ldr qW32A, [wptr2, w_col_stride1]\n"
+ "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+ "ldr qU32A, [uptr2, u_col_stride1]\n"
+ "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+ "str qV11B, [%x[vptr0]], #0x10\n"
+ "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+ "ldr qW31A, [wptr2], #0x10\n"
+ "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+ "str qV21B, [vptr1], #0x10\n"
+ "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+ "bne 1b\n" // Loop
+
+    "2:" // Reached via the cbz above (zero iterations) or when the main loop exits
+ "cbnz %w[odd_tail], 4f\n"
+
+    "3:" // Tail when an even number of blocks remains: finish A then B
+ "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+ "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+ "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+ "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+ "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+ "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+ "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+ "ldr qU44A, [uptr3, u_col_stride3]\n"
+ "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+ "ldr qU43A, [uptr3, u_col_stride2]\n"
+ "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+ "ldr qU42A, [uptr3, u_col_stride1]\n"
+ "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+ "ldr qU11A, [%x[uptr0]], #0x10\n"
+ "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+ "ldr qU21A, [uptr1], #0x10\n"
+ "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+ "ldr qU31A, [uptr2], #0x10\n"
+ "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+ "ldr qU41A, [uptr3], #0x10\n"
+ "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+ "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+ "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+ "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+ "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+ "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+ "ldr qW11B, [%x[wptr0]], #0x10\n"
+ "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+ "ldr qU24B, [uptr1, u_col_stride3]\n"
+ "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+ "ldr qW23B, [wptr1, w_col_stride2]\n"
+ "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+ "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
+ "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+ "ldr qU23B, [uptr1, u_col_stride2]\n"
+ "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+ "ldr qW22B, [wptr1, w_col_stride1]\n"
+ "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+ "ldr qU22B, [uptr1, u_col_stride1]\n"
+ "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+ "ldr qW21B, [wptr1], #0x10\n"
+ "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+ "ldr qU34B, [uptr2, u_col_stride3]\n"
+ "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+ "ldr qW33B, [wptr2, w_col_stride2]\n"
+ "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+ "str qV22A, [vptr1, %x[v_col_stride]]\n"
+ "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+ "ldr qU33B, [uptr2, u_col_stride2]\n"
+ "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+ "ldr qW32B, [wptr2, w_col_stride1]\n"
+ "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+ "ldr qU32B, [uptr2, u_col_stride1]\n"
+ "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+ "str qV11A, [%x[vptr0]], #0x10\n"
+ "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+ "ldr qW31B, [wptr2], #0x10\n"
+ "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+ "str qV21A, [vptr1], #0x10\n"
+
+ "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+ "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+ "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+ "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+ "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+ "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+ "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+ "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+ "ldr qU44B, [uptr3, u_col_stride3]\n"
+ "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+ "ldr qU43B, [uptr3, u_col_stride2]\n"
+ "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+ "ldr qU42B, [uptr3, u_col_stride1]\n"
+ "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+ "ldr qU11B, [%x[uptr0]], #0x10\n"
+ "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+ "ldr qU21B, [uptr1], #0x10\n"
+ "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+ "ldr qU31B, [uptr2], #0x10\n"
+ "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+ "ldr qU41B, [uptr3], #0x10\n"
+ "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+ "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+ "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+ "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+ "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+ "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+ "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+ "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+ "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+ "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
+ "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+ "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+ "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+ "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+ "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+ "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+ "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+ "str qV22B, [vptr1, %x[v_col_stride]]\n"
+ "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+ "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+ "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+ "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+ "str qV11B, [%x[vptr0]], #0x10\n"
+ "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+ "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+ "str qV21B, [vptr1], #0x10\n"
+ "b 5f\n"
+
+    "4:" // Tail when an odd number of blocks remains: finish the final A block
+ "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+ "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+ "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+ "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+ "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+ "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+ "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+ "ldr qU44A, [uptr3, u_col_stride3]\n"
+ "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+ "ldr qU43A, [uptr3, u_col_stride2]\n"
+ "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+ "ldr qU42A, [uptr3, u_col_stride1]\n"
+ "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+ "ldr qU11A, [%x[uptr0]], #0x10\n"
+ "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+ "ldr qU21A, [uptr1], #0x10\n"
+ "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+ "ldr qU31A, [uptr2], #0x10\n"
+ "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+ "ldr qU41A, [uptr3], #0x10\n"
+ "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+ "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+ "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+ "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+ "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+ "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+ "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+ "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+ "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+ "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
+ "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+ "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+ "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+ "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+ "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+ "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+ "str qV22A, [vptr1, %x[v_col_stride]]\n"
+ "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+ "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+ "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+ "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+ "str qV11A, [%x[vptr0]], #0x10\n"
+ "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+ "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+ "str qV21A, [vptr1], #0x10\n"
+
+ "5:" // End of method
+
+ ".unreq qW11B\n" ".unreq qW33A\n" ".unreq qU32B\n"
+ ".unreq qU44B\n" ".unreq qW21A\n" ".unreq qU21B\n" ".unreq qU32A\n"
+ ".unreq qU43A\n" ".unreq qV21B\n"
+ ".unreq qU24A\n" ".unreq qU44A\n" ".unreq qU33B\n"
+ ".unreq qU31A\n" ".unreq qV12B\n" ".unreq qU23A\n"
+ ".unreq qW31B\n" ".unreq qV22A\n" ".unreq qV12A\n" ".unreq qW21B\n"
+ ".unreq qU22B\n" ".unreq qU34A\n" ".unreq qU13B\n" ".unreq qU13A\n"
+ ".unreq qU34B\n" ".unreq qU22A\n" ".unreq qU24B\n" ".unreq qU31B\n"
+ ".unreq qW12B\n" ".unreq qW13A\n" ".unreq qV21A\n" ".unreq qV11B\n"
+ ".unreq qW32A\n" ".unreq qW32B\n" ".unreq qW31A\n" ".unreq qV22B\n"
+ ".unreq qW11A\n" ".unreq qW13B\n" ".unreq qU14A\n"
+ ".unreq qU33A\n" ".unreq qW33B\n" ".unreq qW22A\n" ".unreq qU23B\n"
+ ".unreq qU12A\n" ".unreq qU42A\n" ".unreq qU41A\n" ".unreq qU42B\n"
+ ".unreq qW23A\n" ".unreq qW23B\n" ".unreq qU43B\n" ".unreq qU11A\n"
+ ".unreq qU12B\n" ".unreq qW12A\n" ".unreq qU41B\n" ".unreq qV11A\n"
+ ".unreq qW22B\n" ".unreq qU11B\n" ".unreq qU14B\n" ".unreq qU21A\n"
+ ".unreq vW11B\n" ".unreq vW33A\n" ".unreq vU32B\n"
+ ".unreq vU44B\n" ".unreq vW21A\n" ".unreq vU21B\n" ".unreq vU32A\n"
+ ".unreq vU43A\n" ".unreq vV21B\n"
+ ".unreq vU24A\n" ".unreq vU44A\n" ".unreq vU33B\n"
+ ".unreq vU31A\n" ".unreq vV12B\n" ".unreq vU23A\n"
+ ".unreq vW31B\n" ".unreq vV22A\n" ".unreq vV12A\n" ".unreq vW21B\n"
+ ".unreq vU22B\n" ".unreq vU34A\n" ".unreq vU13B\n" ".unreq vU13A\n"
+ ".unreq vU34B\n" ".unreq vU22A\n" ".unreq vU24B\n" ".unreq vU31B\n"
+ ".unreq vW12B\n" ".unreq vW13A\n" ".unreq vV21A\n" ".unreq vV11B\n"
+ ".unreq vW32A\n" ".unreq vW32B\n" ".unreq vW31A\n" ".unreq vV22B\n"
+ ".unreq vW11A\n" ".unreq vW13B\n" ".unreq vU14A\n"
+ ".unreq vU33A\n" ".unreq vW33B\n" ".unreq vW22A\n" ".unreq vU23B\n"
+ ".unreq vU12A\n" ".unreq vU42A\n" ".unreq vU41A\n" ".unreq vU42B\n"
+ ".unreq vW23A\n" ".unreq vW23B\n" ".unreq vU43B\n" ".unreq vU11A\n"
+ ".unreq vU12B\n" ".unreq vW12A\n" ".unreq vU41B\n" ".unreq vV11A\n"
+ ".unreq vW22B\n" ".unreq vU11B\n" ".unreq vU14B\n" ".unreq vU21A\n"
+ ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
+ ".unreq u_col_stride3\n"
+ ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n"
+ ".unreq wptr1\n" ".unreq wptr2\n" ".unreq vptr1\n"
+ ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
+
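+    // Operand notes: the pointer operands are read-write ("+r") because the
+    // post-indexed loads/stores above advance them in place; all strides are
+    // passed pre-scaled to bytes. The clobber list covers every register
+    // used through a .req alias, plus "cc" and "memory".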
+ : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
+ [iters] "+r" (n_iters)
+ : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+ [u_col_stride] "r" (in_col_stride * sizeof(float)),
+ [v_row_stride] "r" (out_row_stride * sizeof(float)),
+ [v_col_stride] "r" (out_col_stride * sizeof(float)),
+ [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+ [w_col_stride] "r" (weight_col_stride * sizeof(float)),
+ [odd_tail] "r" (odd_tail)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+ "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "memory"
+ );
+ }
+
+ if (channels_remaining)
+ {
+ // Fall back on the unoptimised version to clean up the tail
+ ConvImpl::process_tile<false>(
+ channels_remaining,
+ wptr0, weight_row_stride, weight_col_stride,
+ uptr0, in_row_stride, in_col_stride,
+ vptr0, out_row_stride, out_col_stride,
+ 0, 0, 0, 0, 0, 0
+ );
+ }
+}
+
+#endif // __aarch64__
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
};
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
} // namespace depthwise
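
The refactor above replaces the fully enumerated six-dimensional tile_fns
table with one table per padded edge (tilefn_top, tilefn_left, tilefn_bottom,
tilefn_right), an unpadded fast path and a generic fallback, which cuts the
number of template instantiations from the product of the padding dimensions
to roughly their sum. The dispatch logic is not part of this hunk; a minimal
sketch of how such factored tables could be consumed follows (select_tile_fn
and its parameter names are hypothetical, not the library's API):

// Hypothetical selector: exactly one specialised edge at a time; any
// combination of padded edges falls back to the general implementation.
template <typename Conv>
typename Conv::TileFn select_tile_fn(int pad_top, int pad_left,
                                     int in_pad_bottom, int in_pad_right,
                                     int out_pad_bottom, int out_pad_right)
{
    const bool bottom = in_pad_bottom || out_pad_bottom;
    const bool right  = in_pad_right || out_pad_right;
    if (!pad_top && !pad_left && !bottom && !right)
        return Conv::tilefn_unpadded;
    if (pad_top && !pad_left && !bottom && !right)
        return Conv::tilefn_top[pad_top - 1];
    if (pad_left && !pad_top && !bottom && !right)
        return Conv::tilefn_left[pad_left - 1];
    if (bottom && !pad_top && !pad_left && !right)
        return Conv::tilefn_bottom[in_pad_bottom][out_pad_bottom];
    if (right && !pad_top && !pad_left && !bottom)
        return Conv::tilefn_right[in_pad_right][out_pad_right];
    return Conv::tilefn_generic; // more than one padded edge
}
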
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index 0ec5a77..dba2330 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -29,1067 +29,70 @@
using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 2, 2, float, float>;
template <>
-const Conv::TileFn Conv::tile_fns
- [max_in_pad_top]
- [max_in_pad_left]
- [max_in_pad_bottom]
- [max_in_pad_right]
- [max_out_pad_bottom]
- [max_out_pad_right] = {
- { // Input pad top = 0
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 0, 0, 0>,
- Conv::template process_tile<0, 0, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 0, 1, 0>,
- Conv::template process_tile<0, 0, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 1, 0, 0>,
- Conv::template process_tile<0, 0, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 1, 1, 0>,
- Conv::template process_tile<0, 0, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 2, 0, 0>,
- Conv::template process_tile<0, 0, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 2, 1, 0>,
- Conv::template process_tile<0, 0, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 3, 0, 0>,
- Conv::template process_tile<0, 0, 0, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 3, 1, 0>,
- Conv::template process_tile<0, 0, 0, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 4, 0, 0>,
- Conv::template process_tile<0, 0, 0, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 4, 1, 0>,
- Conv::template process_tile<0, 0, 0, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 0, 0, 0>,
- Conv::template process_tile<0, 0, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 0, 1, 0>,
- Conv::template process_tile<0, 0, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 1, 0, 0>,
- Conv::template process_tile<0, 0, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 1, 1, 0>,
- Conv::template process_tile<0, 0, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 2, 0, 0>,
- Conv::template process_tile<0, 0, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 2, 1, 0>,
- Conv::template process_tile<0, 0, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 3, 0, 0>,
- Conv::template process_tile<0, 0, 1, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 3, 1, 0>,
- Conv::template process_tile<0, 0, 1, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 4, 0, 0>,
- Conv::template process_tile<0, 0, 1, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 4, 1, 0>,
- Conv::template process_tile<0, 0, 1, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 0, 0, 0>,
- Conv::template process_tile<0, 0, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 0, 1, 0>,
- Conv::template process_tile<0, 0, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 1, 0, 0>,
- Conv::template process_tile<0, 0, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 1, 1, 0>,
- Conv::template process_tile<0, 0, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 2, 0, 0>,
- Conv::template process_tile<0, 0, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 2, 1, 0>,
- Conv::template process_tile<0, 0, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 3, 0, 0>,
- Conv::template process_tile<0, 0, 2, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 3, 1, 0>,
- Conv::template process_tile<0, 0, 2, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 4, 0, 0>,
- Conv::template process_tile<0, 0, 2, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 4, 1, 0>,
- Conv::template process_tile<0, 0, 2, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 0, 0, 0>,
- Conv::template process_tile<0, 0, 3, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 0, 1, 0>,
- Conv::template process_tile<0, 0, 3, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 1, 0, 0>,
- Conv::template process_tile<0, 0, 3, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 1, 1, 0>,
- Conv::template process_tile<0, 0, 3, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 2, 0, 0>,
- Conv::template process_tile<0, 0, 3, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 2, 1, 0>,
- Conv::template process_tile<0, 0, 3, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 3, 0, 0>,
- Conv::template process_tile<0, 0, 3, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 3, 1, 0>,
- Conv::template process_tile<0, 0, 3, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 4, 0, 0>,
- Conv::template process_tile<0, 0, 3, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 4, 1, 0>,
- Conv::template process_tile<0, 0, 3, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 0, 0, 0>,
- Conv::template process_tile<0, 0, 4, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 0, 1, 0>,
- Conv::template process_tile<0, 0, 4, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 1, 0, 0>,
- Conv::template process_tile<0, 0, 4, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 1, 1, 0>,
- Conv::template process_tile<0, 0, 4, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 2, 0, 0>,
- Conv::template process_tile<0, 0, 4, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 2, 1, 0>,
- Conv::template process_tile<0, 0, 4, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 3, 0, 0>,
- Conv::template process_tile<0, 0, 4, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 3, 1, 0>,
- Conv::template process_tile<0, 0, 4, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 4, 0, 0>,
- Conv::template process_tile<0, 0, 4, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 4, 1, 0>,
- Conv::template process_tile<0, 0, 4, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 0, 0, 0>,
- Conv::template process_tile<0, 1, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 0, 1, 0>,
- Conv::template process_tile<0, 1, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 1, 0, 0>,
- Conv::template process_tile<0, 1, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 1, 1, 0>,
- Conv::template process_tile<0, 1, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 2, 0, 0>,
- Conv::template process_tile<0, 1, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 2, 1, 0>,
- Conv::template process_tile<0, 1, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 3, 0, 0>,
- Conv::template process_tile<0, 1, 0, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 3, 1, 0>,
- Conv::template process_tile<0, 1, 0, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 4, 0, 0>,
- Conv::template process_tile<0, 1, 0, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 4, 1, 0>,
- Conv::template process_tile<0, 1, 0, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 0, 0, 0>,
- Conv::template process_tile<0, 1, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 0, 1, 0>,
- Conv::template process_tile<0, 1, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 1, 0, 0>,
- Conv::template process_tile<0, 1, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 1, 1, 0>,
- Conv::template process_tile<0, 1, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 2, 0, 0>,
- Conv::template process_tile<0, 1, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 2, 1, 0>,
- Conv::template process_tile<0, 1, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 3, 0, 0>,
- Conv::template process_tile<0, 1, 1, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 3, 1, 0>,
- Conv::template process_tile<0, 1, 1, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 4, 0, 0>,
- Conv::template process_tile<0, 1, 1, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 4, 1, 0>,
- Conv::template process_tile<0, 1, 1, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 0, 0, 0>,
- Conv::template process_tile<0, 1, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 0, 1, 0>,
- Conv::template process_tile<0, 1, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 1, 0, 0>,
- Conv::template process_tile<0, 1, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 1, 1, 0>,
- Conv::template process_tile<0, 1, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 2, 0, 0>,
- Conv::template process_tile<0, 1, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 2, 1, 0>,
- Conv::template process_tile<0, 1, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 3, 0, 0>,
- Conv::template process_tile<0, 1, 2, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 3, 1, 0>,
- Conv::template process_tile<0, 1, 2, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 4, 0, 0>,
- Conv::template process_tile<0, 1, 2, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 4, 1, 0>,
- Conv::template process_tile<0, 1, 2, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 0, 0, 0>,
- Conv::template process_tile<0, 1, 3, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 0, 1, 0>,
- Conv::template process_tile<0, 1, 3, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 1, 0, 0>,
- Conv::template process_tile<0, 1, 3, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 1, 1, 0>,
- Conv::template process_tile<0, 1, 3, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 2, 0, 0>,
- Conv::template process_tile<0, 1, 3, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 2, 1, 0>,
- Conv::template process_tile<0, 1, 3, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 3, 0, 0>,
- Conv::template process_tile<0, 1, 3, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 3, 1, 0>,
- Conv::template process_tile<0, 1, 3, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 4, 0, 0>,
- Conv::template process_tile<0, 1, 3, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 4, 1, 0>,
- Conv::template process_tile<0, 1, 3, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 0, 0, 0>,
- Conv::template process_tile<0, 1, 4, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 0, 1, 0>,
- Conv::template process_tile<0, 1, 4, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 1, 0, 0>,
- Conv::template process_tile<0, 1, 4, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 1, 1, 0>,
- Conv::template process_tile<0, 1, 4, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 2, 0, 0>,
- Conv::template process_tile<0, 1, 4, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 2, 1, 0>,
- Conv::template process_tile<0, 1, 4, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 3, 0, 0>,
- Conv::template process_tile<0, 1, 4, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 3, 1, 0>,
- Conv::template process_tile<0, 1, 4, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 4, 0, 0>,
- Conv::template process_tile<0, 1, 4, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 4, 1, 0>,
- Conv::template process_tile<0, 1, 4, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 1
- }, // Input pad top = 0
- { // Input pad top = 1
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 0, 0, 0>,
- Conv::template process_tile<1, 0, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 0, 1, 0>,
- Conv::template process_tile<1, 0, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 1, 0, 0>,
- Conv::template process_tile<1, 0, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 1, 1, 0>,
- Conv::template process_tile<1, 0, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 2, 0, 0>,
- Conv::template process_tile<1, 0, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 2, 1, 0>,
- Conv::template process_tile<1, 0, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 3, 0, 0>,
- Conv::template process_tile<1, 0, 0, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 3, 1, 0>,
- Conv::template process_tile<1, 0, 0, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 4, 0, 0>,
- Conv::template process_tile<1, 0, 0, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 4, 1, 0>,
- Conv::template process_tile<1, 0, 0, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 0, 0, 0>,
- Conv::template process_tile<1, 0, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 0, 1, 0>,
- Conv::template process_tile<1, 0, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 1, 0, 0>,
- Conv::template process_tile<1, 0, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 1, 1, 0>,
- Conv::template process_tile<1, 0, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 2, 0, 0>,
- Conv::template process_tile<1, 0, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 2, 1, 0>,
- Conv::template process_tile<1, 0, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 3, 0, 0>,
- Conv::template process_tile<1, 0, 1, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 3, 1, 0>,
- Conv::template process_tile<1, 0, 1, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 4, 0, 0>,
- Conv::template process_tile<1, 0, 1, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 4, 1, 0>,
- Conv::template process_tile<1, 0, 1, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 0, 0, 0>,
- Conv::template process_tile<1, 0, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 0, 1, 0>,
- Conv::template process_tile<1, 0, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 1, 0, 0>,
- Conv::template process_tile<1, 0, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 1, 1, 0>,
- Conv::template process_tile<1, 0, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 2, 0, 0>,
- Conv::template process_tile<1, 0, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 2, 1, 0>,
- Conv::template process_tile<1, 0, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 3, 0, 0>,
- Conv::template process_tile<1, 0, 2, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 3, 1, 0>,
- Conv::template process_tile<1, 0, 2, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 4, 0, 0>,
- Conv::template process_tile<1, 0, 2, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 4, 1, 0>,
- Conv::template process_tile<1, 0, 2, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 0, 0, 0>,
- Conv::template process_tile<1, 0, 3, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 0, 1, 0>,
- Conv::template process_tile<1, 0, 3, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 1, 0, 0>,
- Conv::template process_tile<1, 0, 3, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 1, 1, 0>,
- Conv::template process_tile<1, 0, 3, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 2, 0, 0>,
- Conv::template process_tile<1, 0, 3, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 2, 1, 0>,
- Conv::template process_tile<1, 0, 3, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 3, 0, 0>,
- Conv::template process_tile<1, 0, 3, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 3, 1, 0>,
- Conv::template process_tile<1, 0, 3, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 4, 0, 0>,
- Conv::template process_tile<1, 0, 3, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 4, 1, 0>,
- Conv::template process_tile<1, 0, 3, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 0, 0, 0>,
- Conv::template process_tile<1, 0, 4, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 0, 1, 0>,
- Conv::template process_tile<1, 0, 4, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 1, 0, 0>,
- Conv::template process_tile<1, 0, 4, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 1, 1, 0>,
- Conv::template process_tile<1, 0, 4, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 2, 0, 0>,
- Conv::template process_tile<1, 0, 4, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 2, 1, 0>,
- Conv::template process_tile<1, 0, 4, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 3, 0, 0>,
- Conv::template process_tile<1, 0, 4, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 3, 1, 0>,
- Conv::template process_tile<1, 0, 4, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 4, 0, 0>,
- Conv::template process_tile<1, 0, 4, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 4, 1, 0>,
- Conv::template process_tile<1, 0, 4, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 0, 0, 0>,
- Conv::template process_tile<1, 1, 0, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 0, 1, 0>,
- Conv::template process_tile<1, 1, 0, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 1, 0, 0>,
- Conv::template process_tile<1, 1, 0, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 1, 1, 0>,
- Conv::template process_tile<1, 1, 0, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 2, 0, 0>,
- Conv::template process_tile<1, 1, 0, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 2, 1, 0>,
- Conv::template process_tile<1, 1, 0, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 3, 0, 0>,
- Conv::template process_tile<1, 1, 0, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 3, 1, 0>,
- Conv::template process_tile<1, 1, 0, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 4, 0, 0>,
- Conv::template process_tile<1, 1, 0, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 4, 1, 0>,
- Conv::template process_tile<1, 1, 0, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 0, 0, 0>,
- Conv::template process_tile<1, 1, 1, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 0, 1, 0>,
- Conv::template process_tile<1, 1, 1, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 1, 0, 0>,
- Conv::template process_tile<1, 1, 1, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 1, 1, 0>,
- Conv::template process_tile<1, 1, 1, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 2, 0, 0>,
- Conv::template process_tile<1, 1, 1, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 2, 1, 0>,
- Conv::template process_tile<1, 1, 1, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 3, 0, 0>,
- Conv::template process_tile<1, 1, 1, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 3, 1, 0>,
- Conv::template process_tile<1, 1, 1, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 4, 0, 0>,
- Conv::template process_tile<1, 1, 1, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 4, 1, 0>,
- Conv::template process_tile<1, 1, 1, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 0, 0, 0>,
- Conv::template process_tile<1, 1, 2, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 0, 1, 0>,
- Conv::template process_tile<1, 1, 2, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 1, 0, 0>,
- Conv::template process_tile<1, 1, 2, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 1, 1, 0>,
- Conv::template process_tile<1, 1, 2, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 2, 0, 0>,
- Conv::template process_tile<1, 1, 2, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 2, 1, 0>,
- Conv::template process_tile<1, 1, 2, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 3, 0, 0>,
- Conv::template process_tile<1, 1, 2, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 3, 1, 0>,
- Conv::template process_tile<1, 1, 2, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 4, 0, 0>,
- Conv::template process_tile<1, 1, 2, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 4, 1, 0>,
- Conv::template process_tile<1, 1, 2, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 0, 0, 0>,
- Conv::template process_tile<1, 1, 3, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 0, 1, 0>,
- Conv::template process_tile<1, 1, 3, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 1, 0, 0>,
- Conv::template process_tile<1, 1, 3, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 1, 1, 0>,
- Conv::template process_tile<1, 1, 3, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 2, 0, 0>,
- Conv::template process_tile<1, 1, 3, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 2, 1, 0>,
- Conv::template process_tile<1, 1, 3, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 3, 0, 0>,
- Conv::template process_tile<1, 1, 3, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 3, 1, 0>,
- Conv::template process_tile<1, 1, 3, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 4, 0, 0>,
- Conv::template process_tile<1, 1, 3, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 4, 1, 0>,
- Conv::template process_tile<1, 1, 3, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 0, 0, 0>,
- Conv::template process_tile<1, 1, 4, 0, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 0, 1, 0>,
- Conv::template process_tile<1, 1, 4, 0, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 1, 0, 0>,
- Conv::template process_tile<1, 1, 4, 1, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 1, 1, 0>,
- Conv::template process_tile<1, 1, 4, 1, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 2, 0, 0>,
- Conv::template process_tile<1, 1, 4, 2, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 2, 1, 0>,
- Conv::template process_tile<1, 1, 4, 2, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 3, 0, 0>,
- Conv::template process_tile<1, 1, 4, 3, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 3, 1, 0>,
- Conv::template process_tile<1, 1, 4, 3, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 4, 0, 0>,
- Conv::template process_tile<1, 1, 4, 4, 0, 1>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 4, 1, 0>,
- Conv::template process_tile<1, 1, 4, 4, 1, 1>,
- }, // Output pad bottom = 1
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 1
- }, // Input pad top = 1
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
};
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
} // namespace depthwise
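
The hunk above collapses the exhaustive six-dimensional `tile_fns` lookup table, which instantiated one `process_tile` specialisation per combination of input and output padding (2·2·5·5·2·2 = 400 for this stride-2 kernel), into a handful of small tables: a single unpadded fast path, one-dimensional tables for tiles padded only on the top or left input edge, two-dimensional tables for the bottom and right edges (where output padding can also occur), and a run-time-parameterised `tilefn_generic` fallback for every remaining combination. That reduces the unique template instantiations per kernel from hundreds to roughly two dozen, shrinking both compile time and code size. The following is a minimal sketch of how a caller could select among these tables; `pick_tile_fn` is a hypothetical helper written for illustration (the library's actual selection logic lives elsewhere), and it assumes the `TileFn` signature passes the padding amounts as run-time arguments that the statically specialised variants simply ignore.

static Conv::TileFn pick_tile_fn(
    const int in_pad_top, const int in_pad_left,
    const int in_pad_bottom, const int in_pad_right,
    const int out_pad_bottom, const int out_pad_right)
{
  const bool no_bottom = (in_pad_bottom == 0 && out_pad_bottom == 0);
  const bool no_right  = (in_pad_right == 0 && out_pad_right == 0);

  // Interior tile: no padding at all, take the fully specialised fast path.
  if (in_pad_top == 0 && in_pad_left == 0 && no_bottom && no_right)
    return Conv::tilefn_unpadded;

  // Tiles touching exactly one edge use the small tables defined above,
  // indexed by the padding amounts (index 0 is the unpadded instantiation).
  if (in_pad_left == 0 && no_bottom && no_right)
    return Conv::tilefn_top[in_pad_top];
  if (in_pad_top == 0 && no_bottom && no_right)
    return Conv::tilefn_left[in_pad_left];
  if (in_pad_top == 0 && in_pad_left == 0 && no_right)
    return Conv::tilefn_bottom[in_pad_bottom][out_pad_bottom];
  if (in_pad_top == 0 && in_pad_left == 0 && no_bottom)
    return Conv::tilefn_right[in_pad_right][out_pad_right];

  // Padding on more than one edge falls back to the generic kernel,
  // which reads the padding amounts at run time.
  return Conv::tilefn_generic;
}

Routing multi-edge tiles through the slower generic kernel costs little, since typically only the few corner tiles of an output plane combine padding on two edges, while it avoids the combinatorial blow-up of specialisations that the old table paid for.
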
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index dc3c383..b946e5d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -28,1148 +28,928 @@
using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>;
+#ifdef __aarch64__
+
template <>
-const Conv::TileFn Conv::tile_fns
- [max_in_pad_top]
- [max_in_pad_left]
- [max_in_pad_bottom]
- [max_in_pad_right]
- [max_out_pad_bottom]
- [max_out_pad_right] = {
- { // Input pad top = 0
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 0, 0, 0>,
- Conv::template process_tile<0, 0, 0, 0, 0, 1>,
- Conv::template process_tile<0, 0, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 0, 1, 0>,
- Conv::template process_tile<0, 0, 0, 0, 1, 1>,
- Conv::template process_tile<0, 0, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 0, 2, 0>,
- Conv::template process_tile<0, 0, 0, 0, 2, 1>,
- Conv::template process_tile<0, 0, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 1, 0, 0>,
- Conv::template process_tile<0, 0, 0, 1, 0, 1>,
- Conv::template process_tile<0, 0, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 1, 1, 0>,
- Conv::template process_tile<0, 0, 0, 1, 1, 1>,
- Conv::template process_tile<0, 0, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 1, 2, 0>,
- Conv::template process_tile<0, 0, 0, 1, 2, 1>,
- Conv::template process_tile<0, 0, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 2, 0, 0>,
- Conv::template process_tile<0, 0, 0, 2, 0, 1>,
- Conv::template process_tile<0, 0, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 2, 1, 0>,
- Conv::template process_tile<0, 0, 0, 2, 1, 1>,
- Conv::template process_tile<0, 0, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 2, 2, 0>,
- Conv::template process_tile<0, 0, 0, 2, 2, 1>,
- Conv::template process_tile<0, 0, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 3, 0, 0>,
- Conv::template process_tile<0, 0, 0, 3, 0, 1>,
- Conv::template process_tile<0, 0, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 3, 1, 0>,
- Conv::template process_tile<0, 0, 0, 3, 1, 1>,
- Conv::template process_tile<0, 0, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 3, 2, 0>,
- Conv::template process_tile<0, 0, 0, 3, 2, 1>,
- Conv::template process_tile<0, 0, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 0, 0, 0>,
- Conv::template process_tile<0, 0, 1, 0, 0, 1>,
- Conv::template process_tile<0, 0, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 0, 1, 0>,
- Conv::template process_tile<0, 0, 1, 0, 1, 1>,
- Conv::template process_tile<0, 0, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 0, 2, 0>,
- Conv::template process_tile<0, 0, 1, 0, 2, 1>,
- Conv::template process_tile<0, 0, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 1, 0, 0>,
- Conv::template process_tile<0, 0, 1, 1, 0, 1>,
- Conv::template process_tile<0, 0, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 1, 1, 0>,
- Conv::template process_tile<0, 0, 1, 1, 1, 1>,
- Conv::template process_tile<0, 0, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 1, 2, 0>,
- Conv::template process_tile<0, 0, 1, 1, 2, 1>,
- Conv::template process_tile<0, 0, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 2, 0, 0>,
- Conv::template process_tile<0, 0, 1, 2, 0, 1>,
- Conv::template process_tile<0, 0, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 2, 1, 0>,
- Conv::template process_tile<0, 0, 1, 2, 1, 1>,
- Conv::template process_tile<0, 0, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 2, 2, 0>,
- Conv::template process_tile<0, 0, 1, 2, 2, 1>,
- Conv::template process_tile<0, 0, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 3, 0, 0>,
- Conv::template process_tile<0, 0, 1, 3, 0, 1>,
- Conv::template process_tile<0, 0, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 3, 1, 0>,
- Conv::template process_tile<0, 0, 1, 3, 1, 1>,
- Conv::template process_tile<0, 0, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 3, 2, 0>,
- Conv::template process_tile<0, 0, 1, 3, 2, 1>,
- Conv::template process_tile<0, 0, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 0, 0, 0>,
- Conv::template process_tile<0, 0, 2, 0, 0, 1>,
- Conv::template process_tile<0, 0, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 0, 1, 0>,
- Conv::template process_tile<0, 0, 2, 0, 1, 1>,
- Conv::template process_tile<0, 0, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 0, 2, 0>,
- Conv::template process_tile<0, 0, 2, 0, 2, 1>,
- Conv::template process_tile<0, 0, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 1, 0, 0>,
- Conv::template process_tile<0, 0, 2, 1, 0, 1>,
- Conv::template process_tile<0, 0, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 1, 1, 0>,
- Conv::template process_tile<0, 0, 2, 1, 1, 1>,
- Conv::template process_tile<0, 0, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 1, 2, 0>,
- Conv::template process_tile<0, 0, 2, 1, 2, 1>,
- Conv::template process_tile<0, 0, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 2, 0, 0>,
- Conv::template process_tile<0, 0, 2, 2, 0, 1>,
- Conv::template process_tile<0, 0, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 2, 1, 0>,
- Conv::template process_tile<0, 0, 2, 2, 1, 1>,
- Conv::template process_tile<0, 0, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 2, 2, 0>,
- Conv::template process_tile<0, 0, 2, 2, 2, 1>,
- Conv::template process_tile<0, 0, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 3, 0, 0>,
- Conv::template process_tile<0, 0, 2, 3, 0, 1>,
- Conv::template process_tile<0, 0, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 3, 1, 0>,
- Conv::template process_tile<0, 0, 2, 3, 1, 1>,
- Conv::template process_tile<0, 0, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 3, 2, 0>,
- Conv::template process_tile<0, 0, 2, 3, 2, 1>,
- Conv::template process_tile<0, 0, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 0, 0, 0>,
- Conv::template process_tile<0, 0, 3, 0, 0, 1>,
- Conv::template process_tile<0, 0, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 0, 1, 0>,
- Conv::template process_tile<0, 0, 3, 0, 1, 1>,
- Conv::template process_tile<0, 0, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 0, 2, 0>,
- Conv::template process_tile<0, 0, 3, 0, 2, 1>,
- Conv::template process_tile<0, 0, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 1, 0, 0>,
- Conv::template process_tile<0, 0, 3, 1, 0, 1>,
- Conv::template process_tile<0, 0, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 1, 1, 0>,
- Conv::template process_tile<0, 0, 3, 1, 1, 1>,
- Conv::template process_tile<0, 0, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 1, 2, 0>,
- Conv::template process_tile<0, 0, 3, 1, 2, 1>,
- Conv::template process_tile<0, 0, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 2, 0, 0>,
- Conv::template process_tile<0, 0, 3, 2, 0, 1>,
- Conv::template process_tile<0, 0, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 2, 1, 0>,
- Conv::template process_tile<0, 0, 3, 2, 1, 1>,
- Conv::template process_tile<0, 0, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 2, 2, 0>,
- Conv::template process_tile<0, 0, 3, 2, 2, 1>,
- Conv::template process_tile<0, 0, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 3, 0, 0>,
- Conv::template process_tile<0, 0, 3, 3, 0, 1>,
- Conv::template process_tile<0, 0, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 3, 1, 0>,
- Conv::template process_tile<0, 0, 3, 3, 1, 1>,
- Conv::template process_tile<0, 0, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 3, 2, 0>,
- Conv::template process_tile<0, 0, 3, 3, 2, 1>,
- Conv::template process_tile<0, 0, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 3
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 0, 0, 0>,
- Conv::template process_tile<0, 1, 0, 0, 0, 1>,
- Conv::template process_tile<0, 1, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 0, 1, 0>,
- Conv::template process_tile<0, 1, 0, 0, 1, 1>,
- Conv::template process_tile<0, 1, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 0, 2, 0>,
- Conv::template process_tile<0, 1, 0, 0, 2, 1>,
- Conv::template process_tile<0, 1, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 1, 0, 0>,
- Conv::template process_tile<0, 1, 0, 1, 0, 1>,
- Conv::template process_tile<0, 1, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 1, 1, 0>,
- Conv::template process_tile<0, 1, 0, 1, 1, 1>,
- Conv::template process_tile<0, 1, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 1, 2, 0>,
- Conv::template process_tile<0, 1, 0, 1, 2, 1>,
- Conv::template process_tile<0, 1, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 2, 0, 0>,
- Conv::template process_tile<0, 1, 0, 2, 0, 1>,
- Conv::template process_tile<0, 1, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 2, 1, 0>,
- Conv::template process_tile<0, 1, 0, 2, 1, 1>,
- Conv::template process_tile<0, 1, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 2, 2, 0>,
- Conv::template process_tile<0, 1, 0, 2, 2, 1>,
- Conv::template process_tile<0, 1, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 3, 0, 0>,
- Conv::template process_tile<0, 1, 0, 3, 0, 1>,
- Conv::template process_tile<0, 1, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 3, 1, 0>,
- Conv::template process_tile<0, 1, 0, 3, 1, 1>,
- Conv::template process_tile<0, 1, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 3, 2, 0>,
- Conv::template process_tile<0, 1, 0, 3, 2, 1>,
- Conv::template process_tile<0, 1, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 0, 0, 0>,
- Conv::template process_tile<0, 1, 1, 0, 0, 1>,
- Conv::template process_tile<0, 1, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 0, 1, 0>,
- Conv::template process_tile<0, 1, 1, 0, 1, 1>,
- Conv::template process_tile<0, 1, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 0, 2, 0>,
- Conv::template process_tile<0, 1, 1, 0, 2, 1>,
- Conv::template process_tile<0, 1, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 1, 0, 0>,
- Conv::template process_tile<0, 1, 1, 1, 0, 1>,
- Conv::template process_tile<0, 1, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 1, 1, 0>,
- Conv::template process_tile<0, 1, 1, 1, 1, 1>,
- Conv::template process_tile<0, 1, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 1, 2, 0>,
- Conv::template process_tile<0, 1, 1, 1, 2, 1>,
- Conv::template process_tile<0, 1, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 2, 0, 0>,
- Conv::template process_tile<0, 1, 1, 2, 0, 1>,
- Conv::template process_tile<0, 1, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 2, 1, 0>,
- Conv::template process_tile<0, 1, 1, 2, 1, 1>,
- Conv::template process_tile<0, 1, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 2, 2, 0>,
- Conv::template process_tile<0, 1, 1, 2, 2, 1>,
- Conv::template process_tile<0, 1, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 3, 0, 0>,
- Conv::template process_tile<0, 1, 1, 3, 0, 1>,
- Conv::template process_tile<0, 1, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 3, 1, 0>,
- Conv::template process_tile<0, 1, 1, 3, 1, 1>,
- Conv::template process_tile<0, 1, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 3, 2, 0>,
- Conv::template process_tile<0, 1, 1, 3, 2, 1>,
- Conv::template process_tile<0, 1, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 0, 0, 0>,
- Conv::template process_tile<0, 1, 2, 0, 0, 1>,
- Conv::template process_tile<0, 1, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 0, 1, 0>,
- Conv::template process_tile<0, 1, 2, 0, 1, 1>,
- Conv::template process_tile<0, 1, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 0, 2, 0>,
- Conv::template process_tile<0, 1, 2, 0, 2, 1>,
- Conv::template process_tile<0, 1, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 1, 0, 0>,
- Conv::template process_tile<0, 1, 2, 1, 0, 1>,
- Conv::template process_tile<0, 1, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 1, 1, 0>,
- Conv::template process_tile<0, 1, 2, 1, 1, 1>,
- Conv::template process_tile<0, 1, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 1, 2, 0>,
- Conv::template process_tile<0, 1, 2, 1, 2, 1>,
- Conv::template process_tile<0, 1, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 2, 0, 0>,
- Conv::template process_tile<0, 1, 2, 2, 0, 1>,
- Conv::template process_tile<0, 1, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 2, 1, 0>,
- Conv::template process_tile<0, 1, 2, 2, 1, 1>,
- Conv::template process_tile<0, 1, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 2, 2, 0>,
- Conv::template process_tile<0, 1, 2, 2, 2, 1>,
- Conv::template process_tile<0, 1, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 3, 0, 0>,
- Conv::template process_tile<0, 1, 2, 3, 0, 1>,
- Conv::template process_tile<0, 1, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 3, 1, 0>,
- Conv::template process_tile<0, 1, 2, 3, 1, 1>,
- Conv::template process_tile<0, 1, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 3, 2, 0>,
- Conv::template process_tile<0, 1, 2, 3, 2, 1>,
- Conv::template process_tile<0, 1, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 0, 0, 0>,
- Conv::template process_tile<0, 1, 3, 0, 0, 1>,
- Conv::template process_tile<0, 1, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 0, 1, 0>,
- Conv::template process_tile<0, 1, 3, 0, 1, 1>,
- Conv::template process_tile<0, 1, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 0, 2, 0>,
- Conv::template process_tile<0, 1, 3, 0, 2, 1>,
- Conv::template process_tile<0, 1, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 1, 0, 0>,
- Conv::template process_tile<0, 1, 3, 1, 0, 1>,
- Conv::template process_tile<0, 1, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 1, 1, 0>,
- Conv::template process_tile<0, 1, 3, 1, 1, 1>,
- Conv::template process_tile<0, 1, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 1, 2, 0>,
- Conv::template process_tile<0, 1, 3, 1, 2, 1>,
- Conv::template process_tile<0, 1, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 2, 0, 0>,
- Conv::template process_tile<0, 1, 3, 2, 0, 1>,
- Conv::template process_tile<0, 1, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 2, 1, 0>,
- Conv::template process_tile<0, 1, 3, 2, 1, 1>,
- Conv::template process_tile<0, 1, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 2, 2, 0>,
- Conv::template process_tile<0, 1, 3, 2, 2, 1>,
- Conv::template process_tile<0, 1, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 3, 0, 0>,
- Conv::template process_tile<0, 1, 3, 3, 0, 1>,
- Conv::template process_tile<0, 1, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 3, 1, 0>,
- Conv::template process_tile<0, 1, 3, 3, 1, 1>,
- Conv::template process_tile<0, 1, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 3, 2, 0>,
- Conv::template process_tile<0, 1, 3, 3, 2, 1>,
- Conv::template process_tile<0, 1, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 3
- }, // Input pad left = 1
- }, // Input pad top = 0
- { // Input pad top = 1
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 0, 0, 0>,
- Conv::template process_tile<1, 0, 0, 0, 0, 1>,
- Conv::template process_tile<1, 0, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 0, 1, 0>,
- Conv::template process_tile<1, 0, 0, 0, 1, 1>,
- Conv::template process_tile<1, 0, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 0, 2, 0>,
- Conv::template process_tile<1, 0, 0, 0, 2, 1>,
- Conv::template process_tile<1, 0, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 1, 0, 0>,
- Conv::template process_tile<1, 0, 0, 1, 0, 1>,
- Conv::template process_tile<1, 0, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 1, 1, 0>,
- Conv::template process_tile<1, 0, 0, 1, 1, 1>,
- Conv::template process_tile<1, 0, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 1, 2, 0>,
- Conv::template process_tile<1, 0, 0, 1, 2, 1>,
- Conv::template process_tile<1, 0, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 2, 0, 0>,
- Conv::template process_tile<1, 0, 0, 2, 0, 1>,
- Conv::template process_tile<1, 0, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 2, 1, 0>,
- Conv::template process_tile<1, 0, 0, 2, 1, 1>,
- Conv::template process_tile<1, 0, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 2, 2, 0>,
- Conv::template process_tile<1, 0, 0, 2, 2, 1>,
- Conv::template process_tile<1, 0, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 3, 0, 0>,
- Conv::template process_tile<1, 0, 0, 3, 0, 1>,
- Conv::template process_tile<1, 0, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 3, 1, 0>,
- Conv::template process_tile<1, 0, 0, 3, 1, 1>,
- Conv::template process_tile<1, 0, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 3, 2, 0>,
- Conv::template process_tile<1, 0, 0, 3, 2, 1>,
- Conv::template process_tile<1, 0, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 0, 0, 0>,
- Conv::template process_tile<1, 0, 1, 0, 0, 1>,
- Conv::template process_tile<1, 0, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 0, 1, 0>,
- Conv::template process_tile<1, 0, 1, 0, 1, 1>,
- Conv::template process_tile<1, 0, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 0, 2, 0>,
- Conv::template process_tile<1, 0, 1, 0, 2, 1>,
- Conv::template process_tile<1, 0, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 1, 0, 0>,
- Conv::template process_tile<1, 0, 1, 1, 0, 1>,
- Conv::template process_tile<1, 0, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 1, 1, 0>,
- Conv::template process_tile<1, 0, 1, 1, 1, 1>,
- Conv::template process_tile<1, 0, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 1, 2, 0>,
- Conv::template process_tile<1, 0, 1, 1, 2, 1>,
- Conv::template process_tile<1, 0, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 2, 0, 0>,
- Conv::template process_tile<1, 0, 1, 2, 0, 1>,
- Conv::template process_tile<1, 0, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 2, 1, 0>,
- Conv::template process_tile<1, 0, 1, 2, 1, 1>,
- Conv::template process_tile<1, 0, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 2, 2, 0>,
- Conv::template process_tile<1, 0, 1, 2, 2, 1>,
- Conv::template process_tile<1, 0, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 3, 0, 0>,
- Conv::template process_tile<1, 0, 1, 3, 0, 1>,
- Conv::template process_tile<1, 0, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 3, 1, 0>,
- Conv::template process_tile<1, 0, 1, 3, 1, 1>,
- Conv::template process_tile<1, 0, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 3, 2, 0>,
- Conv::template process_tile<1, 0, 1, 3, 2, 1>,
- Conv::template process_tile<1, 0, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 0, 0, 0>,
- Conv::template process_tile<1, 0, 2, 0, 0, 1>,
- Conv::template process_tile<1, 0, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 0, 1, 0>,
- Conv::template process_tile<1, 0, 2, 0, 1, 1>,
- Conv::template process_tile<1, 0, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 0, 2, 0>,
- Conv::template process_tile<1, 0, 2, 0, 2, 1>,
- Conv::template process_tile<1, 0, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 1, 0, 0>,
- Conv::template process_tile<1, 0, 2, 1, 0, 1>,
- Conv::template process_tile<1, 0, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 1, 1, 0>,
- Conv::template process_tile<1, 0, 2, 1, 1, 1>,
- Conv::template process_tile<1, 0, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 1, 2, 0>,
- Conv::template process_tile<1, 0, 2, 1, 2, 1>,
- Conv::template process_tile<1, 0, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 2, 0, 0>,
- Conv::template process_tile<1, 0, 2, 2, 0, 1>,
- Conv::template process_tile<1, 0, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 2, 1, 0>,
- Conv::template process_tile<1, 0, 2, 2, 1, 1>,
- Conv::template process_tile<1, 0, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 2, 2, 0>,
- Conv::template process_tile<1, 0, 2, 2, 2, 1>,
- Conv::template process_tile<1, 0, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 3, 0, 0>,
- Conv::template process_tile<1, 0, 2, 3, 0, 1>,
- Conv::template process_tile<1, 0, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 3, 1, 0>,
- Conv::template process_tile<1, 0, 2, 3, 1, 1>,
- Conv::template process_tile<1, 0, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 3, 2, 0>,
- Conv::template process_tile<1, 0, 2, 3, 2, 1>,
- Conv::template process_tile<1, 0, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 0, 0, 0>,
- Conv::template process_tile<1, 0, 3, 0, 0, 1>,
- Conv::template process_tile<1, 0, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 0, 1, 0>,
- Conv::template process_tile<1, 0, 3, 0, 1, 1>,
- Conv::template process_tile<1, 0, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 0, 2, 0>,
- Conv::template process_tile<1, 0, 3, 0, 2, 1>,
- Conv::template process_tile<1, 0, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 1, 0, 0>,
- Conv::template process_tile<1, 0, 3, 1, 0, 1>,
- Conv::template process_tile<1, 0, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 1, 1, 0>,
- Conv::template process_tile<1, 0, 3, 1, 1, 1>,
- Conv::template process_tile<1, 0, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 1, 2, 0>,
- Conv::template process_tile<1, 0, 3, 1, 2, 1>,
- Conv::template process_tile<1, 0, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 2, 0, 0>,
- Conv::template process_tile<1, 0, 3, 2, 0, 1>,
- Conv::template process_tile<1, 0, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 2, 1, 0>,
- Conv::template process_tile<1, 0, 3, 2, 1, 1>,
- Conv::template process_tile<1, 0, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 2, 2, 0>,
- Conv::template process_tile<1, 0, 3, 2, 2, 1>,
- Conv::template process_tile<1, 0, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 3, 0, 0>,
- Conv::template process_tile<1, 0, 3, 3, 0, 1>,
- Conv::template process_tile<1, 0, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 3, 1, 0>,
- Conv::template process_tile<1, 0, 3, 3, 1, 1>,
- Conv::template process_tile<1, 0, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 3, 2, 0>,
- Conv::template process_tile<1, 0, 3, 3, 2, 1>,
- Conv::template process_tile<1, 0, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 3
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 0, 0, 0>,
- Conv::template process_tile<1, 1, 0, 0, 0, 1>,
- Conv::template process_tile<1, 1, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 0, 1, 0>,
- Conv::template process_tile<1, 1, 0, 0, 1, 1>,
- Conv::template process_tile<1, 1, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 0, 2, 0>,
- Conv::template process_tile<1, 1, 0, 0, 2, 1>,
- Conv::template process_tile<1, 1, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 1, 0, 0>,
- Conv::template process_tile<1, 1, 0, 1, 0, 1>,
- Conv::template process_tile<1, 1, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 1, 1, 0>,
- Conv::template process_tile<1, 1, 0, 1, 1, 1>,
- Conv::template process_tile<1, 1, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 1, 2, 0>,
- Conv::template process_tile<1, 1, 0, 1, 2, 1>,
- Conv::template process_tile<1, 1, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 2, 0, 0>,
- Conv::template process_tile<1, 1, 0, 2, 0, 1>,
- Conv::template process_tile<1, 1, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 2, 1, 0>,
- Conv::template process_tile<1, 1, 0, 2, 1, 1>,
- Conv::template process_tile<1, 1, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 2, 2, 0>,
- Conv::template process_tile<1, 1, 0, 2, 2, 1>,
- Conv::template process_tile<1, 1, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 3, 0, 0>,
- Conv::template process_tile<1, 1, 0, 3, 0, 1>,
- Conv::template process_tile<1, 1, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 3, 1, 0>,
- Conv::template process_tile<1, 1, 0, 3, 1, 1>,
- Conv::template process_tile<1, 1, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 3, 2, 0>,
- Conv::template process_tile<1, 1, 0, 3, 2, 1>,
- Conv::template process_tile<1, 1, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 0, 0, 0>,
- Conv::template process_tile<1, 1, 1, 0, 0, 1>,
- Conv::template process_tile<1, 1, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 0, 1, 0>,
- Conv::template process_tile<1, 1, 1, 0, 1, 1>,
- Conv::template process_tile<1, 1, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 0, 2, 0>,
- Conv::template process_tile<1, 1, 1, 0, 2, 1>,
- Conv::template process_tile<1, 1, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 1, 0, 0>,
- Conv::template process_tile<1, 1, 1, 1, 0, 1>,
- Conv::template process_tile<1, 1, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 1, 1, 0>,
- Conv::template process_tile<1, 1, 1, 1, 1, 1>,
- Conv::template process_tile<1, 1, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 1, 2, 0>,
- Conv::template process_tile<1, 1, 1, 1, 2, 1>,
- Conv::template process_tile<1, 1, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 2, 0, 0>,
- Conv::template process_tile<1, 1, 1, 2, 0, 1>,
- Conv::template process_tile<1, 1, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 2, 1, 0>,
- Conv::template process_tile<1, 1, 1, 2, 1, 1>,
- Conv::template process_tile<1, 1, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 2, 2, 0>,
- Conv::template process_tile<1, 1, 1, 2, 2, 1>,
- Conv::template process_tile<1, 1, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 3, 0, 0>,
- Conv::template process_tile<1, 1, 1, 3, 0, 1>,
- Conv::template process_tile<1, 1, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 3, 1, 0>,
- Conv::template process_tile<1, 1, 1, 3, 1, 1>,
- Conv::template process_tile<1, 1, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 3, 2, 0>,
- Conv::template process_tile<1, 1, 1, 3, 2, 1>,
- Conv::template process_tile<1, 1, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 0, 0, 0>,
- Conv::template process_tile<1, 1, 2, 0, 0, 1>,
- Conv::template process_tile<1, 1, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 0, 1, 0>,
- Conv::template process_tile<1, 1, 2, 0, 1, 1>,
- Conv::template process_tile<1, 1, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 0, 2, 0>,
- Conv::template process_tile<1, 1, 2, 0, 2, 1>,
- Conv::template process_tile<1, 1, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 1, 0, 0>,
- Conv::template process_tile<1, 1, 2, 1, 0, 1>,
- Conv::template process_tile<1, 1, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 1, 1, 0>,
- Conv::template process_tile<1, 1, 2, 1, 1, 1>,
- Conv::template process_tile<1, 1, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 1, 2, 0>,
- Conv::template process_tile<1, 1, 2, 1, 2, 1>,
- Conv::template process_tile<1, 1, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 2, 0, 0>,
- Conv::template process_tile<1, 1, 2, 2, 0, 1>,
- Conv::template process_tile<1, 1, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 2, 1, 0>,
- Conv::template process_tile<1, 1, 2, 2, 1, 1>,
- Conv::template process_tile<1, 1, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 2, 2, 0>,
- Conv::template process_tile<1, 1, 2, 2, 2, 1>,
- Conv::template process_tile<1, 1, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 3, 0, 0>,
- Conv::template process_tile<1, 1, 2, 3, 0, 1>,
- Conv::template process_tile<1, 1, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 3, 1, 0>,
- Conv::template process_tile<1, 1, 2, 3, 1, 1>,
- Conv::template process_tile<1, 1, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 3, 2, 0>,
- Conv::template process_tile<1, 1, 2, 3, 2, 1>,
- Conv::template process_tile<1, 1, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 0, 0, 0>,
- Conv::template process_tile<1, 1, 3, 0, 0, 1>,
- Conv::template process_tile<1, 1, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 0, 1, 0>,
- Conv::template process_tile<1, 1, 3, 0, 1, 1>,
- Conv::template process_tile<1, 1, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 0, 2, 0>,
- Conv::template process_tile<1, 1, 3, 0, 2, 1>,
- Conv::template process_tile<1, 1, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 1, 0, 0>,
- Conv::template process_tile<1, 1, 3, 1, 0, 1>,
- Conv::template process_tile<1, 1, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 1, 1, 0>,
- Conv::template process_tile<1, 1, 3, 1, 1, 1>,
- Conv::template process_tile<1, 1, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 1, 2, 0>,
- Conv::template process_tile<1, 1, 3, 1, 2, 1>,
- Conv::template process_tile<1, 1, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 2, 0, 0>,
- Conv::template process_tile<1, 1, 3, 2, 0, 1>,
- Conv::template process_tile<1, 1, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 2, 1, 0>,
- Conv::template process_tile<1, 1, 3, 2, 1, 1>,
- Conv::template process_tile<1, 1, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 2, 2, 0>,
- Conv::template process_tile<1, 1, 3, 2, 2, 1>,
- Conv::template process_tile<1, 1, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 3, 0, 0>,
- Conv::template process_tile<1, 1, 3, 3, 0, 1>,
- Conv::template process_tile<1, 1, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 3, 1, 0>,
- Conv::template process_tile<1, 1, 3, 3, 1, 1>,
- Conv::template process_tile<1, 1, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 3, 2, 0>,
- Conv::template process_tile<1, 1, 3, 3, 2, 1>,
- Conv::template process_tile<1, 1, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- }, // Input pad bottom = 3
- }, // Input pad left = 1
- }, // Input pad top = 1
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+ const int n_channels,
+ const float* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+  const int, const int, const int, const int, const int, const int  // padding arguments (unused in this fully-unpadded specialisation)
+)
+{
+  // Copy the argument pointers; the assembly below advances these copies
+ const float *uptr0 = inptr;
+ const float *wptr0 = weights;
+ float *vptr0 = outptr;
+
+ int channels_remaining = n_channels;
+ if (channels_remaining >= 4)
+ {
+    // Process blocks of 4 channels at a time. The main loop below is
+    // double unrolled (parts A and B), so for N 4-channel blocks we run
+    // (N + 1)/2 - 1 full A+B passes, then a one-block (odd N) or
+    // two-block (even N) tail.
+    int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
+    const bool odd_tail = (channels_remaining / 4) & 1;
+ channels_remaining %= 4;
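+    // Worked example (illustrative numbers): n_channels = 13 gives three
+    // 4-channel blocks, so n_iters = 1 (one A+B pass), odd_tail = true
+    // (one final A-only pass) and channels_remaining = 1, handled by the
+    // unoptimised fallback after the assembly.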
+
+ asm volatile (
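+      // Register aliases: U = input, W = weights, V = output; the digits
+      // give (row, column) within the tile and the A/B suffix marks the two
+      // halves of the double-unrolled loop. Aliases with disjoint live
+      // ranges share a physical q/v register.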
+ "qU22B .req q0\n" "qU23B .req q0\n" "qW22A .req q0\n"
+ "vU22B .req v0\n" "vU23B .req v0\n" "vW22A .req v0\n"
+ "qV12A .req q1\n" "qW11B .req q1\n"
+ "vV12A .req v1\n" "vW11B .req v1\n"
+ "qU41A .req q2\n" "qU32B .req q2\n" "qU33A .req q2\n" "qV13B .req q2\n"
+ "vU41A .req v2\n" "vU32B .req v2\n" "vU33A .req v2\n" "vV13B .req v2\n"
+ "qU42B .req q3\n" "qU13B .req q3\n" "qU44B .req q3\n" "qU55A .req q3\n"
+ "vU42B .req v3\n" "vU13B .req v3\n" "vU44B .req v3\n" "vU55A .req v3\n"
+ "qU34B .req q4\n" "qU15A .req q4\n" "qU42A .req q4\n" "qU44A .req q4\n" "qU12B .req q4\n"
+ "vU34B .req v4\n" "vU15A .req v4\n" "vU42A .req v4\n" "vU44A .req v4\n" "vU12B .req v4\n"
+ "qU33B .req q5\n" "qU52A .req q5\n" "qW23A .req q5\n"
+ "vU33B .req v5\n" "vU52A .req v5\n" "vW23A .req v5\n"
+ "qV31A .req q6\n" "qU13A .req q6\n" "qV12B .req q6\n"
+ "vV31A .req v6\n" "vU13A .req v6\n" "vV12B .req v6\n"
+ "qU35B .req q7\n" "qU51B .req q7\n" "qV11A .req q7\n" "qU53B .req q7\n"
+ "vU35B .req v7\n" "vU51B .req v7\n" "vV11A .req v7\n" "vU53B .req v7\n"
+ "qW21A .req q8\n" "qV22B .req q8\n"
+ "vW21A .req v8\n" "vV22B .req v8\n"
+ "qV33B .req q9\n" "qU14A .req q9\n" "qV23A .req q9\n" "qU25B .req q9\n"
+ "vV33B .req v9\n" "vU14A .req v9\n" "vV23A .req v9\n" "vU25B .req v9\n"
+ "qW21B .req q10\n" "qV32A .req q10\n" "qU35A .req q10\n"
+ "vW21B .req v10\n" "vV32A .req v10\n" "vU35A .req v10\n"
+ "qV11B .req q11\n" "qU15B .req q11\n" "qV33A .req q11\n"
+ "vV11B .req v11\n" "vU15B .req v11\n" "vV33A .req v11\n"
+ "qU11B .req q12\n" "qW23B .req q12\n" "qU45A .req q12\n"
+ "vU11B .req v12\n" "vW23B .req v12\n" "vU45A .req v12\n"
+ "qW11A .req q13\n" "qU45B .req q13\n" "qU52B .req q13\n"
+ "vW11A .req v13\n" "vU45B .req v13\n" "vU52B .req v13\n"
+ "qU55B .req q14\n" "qU25A .req q14\n" "qV21A .req q14\n"
+ "vU55B .req v14\n" "vU25A .req v14\n" "vV21A .req v14\n"
+ "qU53A .req q15\n" "qV21B .req q15\n" "qU31A .req q15\n"
+ "vU53A .req v15\n" "vV21B .req v15\n" "vU31A .req v15\n"
+ "qW13B .req q16\n" "qU23A .req q16\n"
+ "vW13B .req v16\n" "vU23A .req v16\n"
+ "qW33B .req q17\n" "qW33A .req q17\n"
+ "vW33B .req v17\n" "vW33A .req v17\n"
+ "qU24B .req q18\n" "qU32A .req q18\n" "qV31B .req q18\n" "qV13A .req q18\n"
+ "vU24B .req v18\n" "vU32A .req v18\n" "vV31B .req v18\n" "vV13A .req v18\n"
+ "qU31B .req q19\n" "qU11A .req q19\n" "qU54B .req q19\n" "qU43A .req q19\n"
+ "vU31B .req v19\n" "vU11A .req v19\n" "vU54B .req v19\n" "vU43A .req v19\n"
+ "qU24A .req q20\n" "qW12B .req q20\n" "qU54A .req q20\n"
+ "vU24A .req v20\n" "vW12B .req v20\n" "vU54A .req v20\n"
+ "qV23B .req q21\n" "qW12A .req q21\n"
+ "vV23B .req v21\n" "vW12A .req v21\n"
+ "qW32A .req q22\n" "qU43B .req q22\n"
+ "vW32A .req v22\n" "vU43B .req v22\n"
+ "qW31A .req q23\n" "qV32B .req q23\n"
+ "vW31A .req v23\n" "vV32B .req v23\n"
+ "qU22A .req q24\n" "qW31B .req q24\n"
+ "vU22A .req v24\n" "vW31B .req v24\n"
+ "qU21B .req q25\n" "qV22A .req q25\n"
+ "vU21B .req v25\n" "vV22A .req v25\n"
+ "qU34A .req q26\n" "qW22B .req q26\n" "qU12A .req q26\n"
+ "vU34A .req v26\n" "vW22B .req v26\n" "vU12A .req v26\n"
+ "qW13A .req q27\n" "qU51A .req q27\n"
+ "vW13A .req v27\n" "vU51A .req v27\n"
+ "qW32B .req q28\n"
+ "vW32B .req v28\n"
+ "qU41B .req q29\n" "qU14B .req q29\n"
+ "vU41B .req v29\n" "vU14B .req v29\n"
+ "qU21A .req q30\n"
+ "vU21A .req v30\n"
+
+ "uptr1 .req x0\n"
+ "uptr2 .req x1\n"
+ "uptr3 .req x2\n"
+ "uptr4 .req x3\n"
+
+ "u_col_stride1 .req %x[u_col_stride]\n"
+ "u_col_stride2 .req x4\n"
+ "u_col_stride3 .req x5\n"
+ "u_col_stride4 .req x6\n"
+
+ "wptr1 .req x7\n"
+ "wptr2 .req x8\n"
+ "w_col_stride1 .req %x[w_col_stride]\n"
+ "w_col_stride2 .req x9\n"
+
+ "vptr1 .req x10\n"
+ "vptr2 .req x11\n"
+ "v_col_stride1 .req %x[v_col_stride]\n"
+ "v_col_stride2 .req x12\n"
+
+ // Prepare strides and pointers
+ "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+ "add uptr2, uptr1 , %x[u_row_stride]\n"
+ "add uptr3, uptr2 , %x[u_row_stride]\n"
+ "add uptr4, uptr3 , %x[u_row_stride]\n"
+ "add u_col_stride2, u_col_stride1, u_col_stride1\n"
+ "add u_col_stride3, u_col_stride2, u_col_stride1\n"
+ "add u_col_stride4, u_col_stride3, u_col_stride1\n"
+
+ "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+ "add wptr2, wptr1 , %x[w_row_stride]\n"
+ "add w_col_stride2, w_col_stride1, w_col_stride1\n"
+
+ "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+ "add vptr2, vptr1 , %x[v_row_stride]\n"
+ "add v_col_stride2, v_col_stride1, v_col_stride1\n"
+
+    // Pre-load inputs and weights for the first A part (software pipelining)
+ "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+ "ldr qW23A, [wptr1, w_col_stride2]\n"
+ "ldr qW33A, [wptr2, w_col_stride2]\n"
+ "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+ "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
+ "ldr qW22A, [wptr1, w_col_stride1]\n"
+ "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+ "ldr qW32A, [wptr2, w_col_stride1]\n"
+ "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+ "ldr qU25A, [uptr1, u_col_stride4]\n"
+ "ldr qU24A, [uptr1, u_col_stride3]\n"
+ "ldr qW11A, [%x[wptr0]], #0x10\n"
+ "ldr qU23A, [uptr1, u_col_stride2]\n"
+ "ldr qW21A, [wptr1], #0x10\n"
+ "ldr qW31A, [wptr2], #0x10\n"
+ "ldr qU34A, [uptr2, u_col_stride3]\n"
+ "ldr qU35A, [uptr2, u_col_stride4]\n"
+
+ // First part of A
+ "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
+ "ldr qU33A, [uptr2, u_col_stride2]\n"
+ "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+ "cbz %x[n_iters], 2f\n" // Jump to tail if not looping
+
+ "1:" // Main loop, double unrolled
+ // A Part
+ "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
+ "ldr qU45A, [uptr3, u_col_stride4]\n"
+ "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+ "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+ "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
+ "ldr qU44A, [uptr3, u_col_stride3]\n"
+ "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
+ "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
+ "ldr qU43A, [uptr3, u_col_stride2]\n"
+ "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+ "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
+ "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+ "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
+ "ldr qU55A, [uptr4, u_col_stride4]\n"
+ "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+ "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+ "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
+ "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+ "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+ "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
+ "ldr qU54A, [uptr4, u_col_stride3]\n"
+ "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
+ "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
+ "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
+ "ldr qU53A, [uptr4, u_col_stride2]\n"
+ "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+ "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+ "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
+ "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
+ "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
+ "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+ "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+ "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
+ "str qV13A, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+ "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+ "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
+ "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
+ "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
+ "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
+ "ldr qU22A, [uptr1, u_col_stride1]\n"
+ "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
+ "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
+ "ldr qU32A, [uptr2, u_col_stride1]\n"
+ "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+ "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
+ "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
+ "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
+ "ldr qU42A, [uptr3, u_col_stride1]\n"
+ "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+ "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+ "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
+ "str qV23A, [vptr1, v_col_stride2]\n"
+ "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
+ "ldr qW23B, [wptr1, w_col_stride2]\n"
+ "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
+ "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
+ "ldr qU52A, [uptr4, u_col_stride1]\n"
+ "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
+ "ldr qU11A, [%x[uptr0]], #0x10\n"
+ "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
+ "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
+ "ldr qU21A, [uptr1], #0x10\n"
+ "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
+ "ldr qW33B, [wptr2, w_col_stride2]\n"
+ "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
+ "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
+ "str qV33A, [vptr2, v_col_stride2]\n"
+ "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+ "ldr qU31A, [uptr2], #0x10\n"
+ "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+ "ldr qU41A, [uptr3], #0x10\n"
+ "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+ "ldr qU51A, [uptr4], #0x10\n"
+ "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+ "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+ "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
+ "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+ "ldr qW22B, [wptr1, w_col_stride1]\n"
+ "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+ "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+ "str qV12A, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+ "ldr qW32B, [wptr2, w_col_stride1]\n"
+ "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+ "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
+ "ldr qU25B, [uptr1, u_col_stride4]\n"
+ "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
+ "ldr qU24B, [uptr1, u_col_stride3]\n"
+ "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+ "str qV22A, [vptr1, v_col_stride1]\n"
+ "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
+ "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
+ "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
+ "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
+ "str qV32A, [vptr2, v_col_stride1]\n"
+ "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+ "ldr qW11B, [%x[wptr0]], #0x10\n"
+ "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+ "ldr qU23B, [uptr1, u_col_stride2]\n"
+ "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+ "ldr qW21B, [wptr1], #0x10\n"
+ "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+ "str qV11A, [%x[vptr0]], #0x10\n"
+ "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+ "ldr qW31B, [wptr2], #0x10\n"
+ "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
+ "ldr qU34B, [uptr2, u_col_stride3]\n"
+ "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+ "str qV21A, [vptr1], #0x10\n"
+ "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
+ "ldr qU35B, [uptr2, u_col_stride4]\n"
+ "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
+ "str qV31A, [vptr2], #0x10\n"
+
+ // B Part
+ "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
+ "ldr qU33B, [uptr2, u_col_stride2]\n"
+ "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+ "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
+ "ldr qU45B, [uptr3, u_col_stride4]\n"
+ "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+ "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+ "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
+ "ldr qU44B, [uptr3, u_col_stride3]\n"
+ "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
+ "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
+ "ldr qU43B, [uptr3, u_col_stride2]\n"
+ "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+ "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
+ "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+ "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
+ "ldr qU55B, [uptr4, u_col_stride4]\n"
+ "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+ "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+ "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
+ "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+ "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+ "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
+ "ldr qU54B, [uptr4, u_col_stride3]\n"
+ "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
+ "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
+ "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
+ "ldr qU53B, [uptr4, u_col_stride2]\n"
+ "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+ "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
+ "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+ "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
+ "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
+ "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
+ "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+ "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+ "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
+ "str qV13B, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+ "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+ "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
+ "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
+ "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
+ "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
+ "ldr qU22B, [uptr1, u_col_stride1]\n"
+ "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
+ "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
+ "ldr qU32B, [uptr2, u_col_stride1]\n"
+ "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+ "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
+ "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
+ "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
+ "ldr qU42B, [uptr3, u_col_stride1]\n"
+ "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+ "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+ "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
+ "str qV23B, [vptr1, v_col_stride2]\n"
+ "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
+ "ldr qW23A, [wptr1, w_col_stride2]\n"
+ "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
+ "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
+ "ldr qU52B, [uptr4, u_col_stride1]\n"
+ "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
+ "ldr qU11B, [%x[uptr0]], #0x10\n"
+ "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
+ "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
+ "ldr qU21B, [uptr1], #0x10\n"
+ "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
+ "ldr qW33A, [wptr2, w_col_stride2]\n"
+ "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
+ "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
+ "str qV33B, [vptr2, v_col_stride2]\n"
+ "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+ "ldr qU31B, [uptr2], #0x10\n"
+ "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+ "ldr qU41B, [uptr3], #0x10\n"
+ "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+ "ldr qU51B, [uptr4], #0x10\n"
+ "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+ "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+ "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
+ "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+ "ldr qW22A, [wptr1, w_col_stride1]\n"
+ "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+ "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+ "str qV12B, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+ "ldr qW32A, [wptr2, w_col_stride1]\n"
+ "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+ "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
+ "ldr qU25A, [uptr1, u_col_stride4]\n"
+ "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
+ "ldr qU24A, [uptr1, u_col_stride3]\n"
+ "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+ "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+ "str qV22B, [vptr1, v_col_stride1]\n"
+ "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
+ "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
+ "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
+ "subs %x[n_iters], %x[n_iters], #1\n"
+ "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
+ "str qV32B, [vptr2, v_col_stride1]\n"
+ "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+ "ldr qW11A, [%x[wptr0]], #0x10\n"
+ "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+ "ldr qU23A, [uptr1, u_col_stride2]\n"
+ "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+ "ldr qW21A, [wptr1], #0x10\n"
+ "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+ "str qV11B, [%x[vptr0]], #0x10\n"
+ "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+ "ldr qW31A, [wptr2], #0x10\n"
+ "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
+ "ldr qU34A, [uptr2, u_col_stride3]\n"
+ "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+ "str qV21B, [vptr1], #0x10\n"
+ "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
+ "ldr qU35A, [uptr2, u_col_stride4]\n"
+ "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
+ "str qV31B, [vptr2], #0x10\n"
+
+      // Start the next A part before the branch (consumed by the next
+      // loop pass, or by the tail once the loop exits)
+ "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
+ "ldr qU33A, [uptr2, u_col_stride2]\n"
+ "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+ "bne 1b\n" // Loop
+
+ "2:" // Tail dispatch
+ "cbnz %w[odd_tail], 3f\n"
+
+ // Even tail
+ // A Part
+ "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
+ "ldr qU45A, [uptr3, u_col_stride4]\n"
+ "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+ "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+ "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
+ "ldr qU44A, [uptr3, u_col_stride3]\n"
+ "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
+ "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
+ "ldr qU43A, [uptr3, u_col_stride2]\n"
+ "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+ "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
+ "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+ "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
+ "ldr qU55A, [uptr4, u_col_stride4]\n"
+ "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+ "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+ "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
+ "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+ "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+ "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
+ "ldr qU54A, [uptr4, u_col_stride3]\n"
+ "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
+ "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
+ "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
+ "ldr qU53A, [uptr4, u_col_stride2]\n"
+ "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+ "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+ "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
+ "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
+ "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
+ "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+ "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+ "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
+ "str qV13A, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+ "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+ "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
+ "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
+ "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
+ "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
+ "ldr qU22A, [uptr1, u_col_stride1]\n"
+ "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
+ "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
+ "ldr qU32A, [uptr2, u_col_stride1]\n"
+ "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+ "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
+ "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
+ "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
+ "ldr qU42A, [uptr3, u_col_stride1]\n"
+ "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+ "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+ "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
+ "str qV23A, [vptr1, v_col_stride2]\n"
+ "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
+ "ldr qW23B, [wptr1, w_col_stride2]\n"
+ "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
+ "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
+ "ldr qU52A, [uptr4, u_col_stride1]\n"
+ "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
+ "ldr qU11A, [%x[uptr0]], #0x10\n"
+ "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
+ "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
+ "ldr qU21A, [uptr1], #0x10\n"
+ "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
+ "ldr qW33B, [wptr2, w_col_stride2]\n"
+ "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
+ "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
+ "str qV33A, [vptr2, v_col_stride2]\n"
+ "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+ "ldr qU31A, [uptr2], #0x10\n"
+ "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+ "ldr qU41A, [uptr3], #0x10\n"
+ "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+ "ldr qU51A, [uptr4], #0x10\n"
+ "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+ "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+ "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
+ "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+ "ldr qW22B, [wptr1, w_col_stride1]\n"
+ "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+ "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+ "str qV12A, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+ "ldr qW32B, [wptr2, w_col_stride1]\n"
+ "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+ "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
+ "ldr qU25B, [uptr1, u_col_stride4]\n"
+ "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
+ "ldr qU24B, [uptr1, u_col_stride3]\n"
+ "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+ "str qV22A, [vptr1, v_col_stride1]\n"
+ "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
+ "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
+ "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
+ "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
+ "str qV32A, [vptr2, v_col_stride1]\n"
+ "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+ "ldr qW11B, [%x[wptr0]], #0x10\n"
+ "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+ "ldr qU23B, [uptr1, u_col_stride2]\n"
+ "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+ "ldr qW21B, [wptr1], #0x10\n"
+ "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+ "str qV11A, [%x[vptr0]], #0x10\n"
+ "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+ "ldr qW31B, [wptr2], #0x10\n"
+ "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
+ "ldr qU34B, [uptr2, u_col_stride3]\n"
+ "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+ "str qV21A, [vptr1], #0x10\n"
+ "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
+ "ldr qU35B, [uptr2, u_col_stride4]\n"
+ "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
+ "str qV31A, [vptr2], #0x10\n"
+
+ // B Part
+ "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
+ "ldr qU33B, [uptr2, u_col_stride2]\n"
+ "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+ "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
+ "ldr qU45B, [uptr3, u_col_stride4]\n"
+ "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+ "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+ "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
+ "ldr qU44B, [uptr3, u_col_stride3]\n"
+ "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
+ "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
+ "ldr qU43B, [uptr3, u_col_stride2]\n"
+ "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+ "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
+ "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+ "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
+ "ldr qU55B, [uptr4, u_col_stride4]\n"
+ "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+ "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+ "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
+ "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+ "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+ "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
+ "ldr qU54B, [uptr4, u_col_stride3]\n"
+ "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
+ "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
+ "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
+ "ldr qU53B, [uptr4, u_col_stride2]\n"
+ "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+ "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
+ "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+ "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
+ "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
+ "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
+ "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+ "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+ "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
+ "str qV13B, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+ "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+ "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
+ "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
+ "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
+ "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
+ "ldr qU22B, [uptr1, u_col_stride1]\n"
+ "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
+ "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
+ "ldr qU32B, [uptr2, u_col_stride1]\n"
+ "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+ "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
+ "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
+ "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
+ "ldr qU42B, [uptr3, u_col_stride1]\n"
+ "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+ "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+ "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
+ "str qV23B, [vptr1, v_col_stride2]\n"
+ "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
+ "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
+ "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
+ "ldr qU52B, [uptr4, u_col_stride1]\n"
+ "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
+ "ldr qU11B, [%x[uptr0]], #0x10\n"
+ "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
+ "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
+ "ldr qU21B, [uptr1], #0x10\n"
+ "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
+ "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
+ "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
+ "str qV33B, [vptr2, v_col_stride2]\n"
+ "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+ "ldr qU31B, [uptr2], #0x10\n"
+ "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+ "ldr qU41B, [uptr3], #0x10\n"
+ "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+ "ldr qU51B, [uptr4], #0x10\n"
+ "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+ "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+ "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+ "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+ "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+ "str qV12B, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+ "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+ "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
+ "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
+ "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+ "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+ "str qV22B, [vptr1, v_col_stride1]\n"
+ "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
+ "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
+ "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
+ "subs %x[n_iters], %x[n_iters], #1\n"
+ "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
+ "str qV32B, [vptr2, v_col_stride1]\n"
+ "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+ "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+ "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+ "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+ "str qV11B, [%x[vptr0]], #0x10\n"
+ "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+ "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
+ "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+ "str qV21B, [vptr1], #0x10\n"
+ "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
+ "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
+ "str qV31B, [vptr2], #0x10\n"
+
+ "b 4f\n" // Branch to end of method
+
+ "3:" // Odd tail, finish off A
+ "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
+ "ldr qU45A, [uptr3, u_col_stride4]\n"
+ "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+ "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+ "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
+ "ldr qU44A, [uptr3, u_col_stride3]\n"
+ "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
+ "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
+ "ldr qU43A, [uptr3, u_col_stride2]\n"
+ "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+ "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
+ "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+ "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
+ "ldr qU55A, [uptr4, u_col_stride4]\n"
+ "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+ "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+ "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
+ "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+ "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+ "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
+ "ldr qU54A, [uptr4, u_col_stride3]\n"
+ "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
+ "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
+ "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
+ "ldr qU53A, [uptr4, u_col_stride2]\n"
+ "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+ "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+ "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
+ "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
+ "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
+ "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+ "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+ "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
+ "str qV13A, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+ "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+ "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
+ "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
+ "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
+ "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
+ "ldr qU22A, [uptr1, u_col_stride1]\n"
+ "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
+ "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
+ "ldr qU32A, [uptr2, u_col_stride1]\n"
+ "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+ "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
+ "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
+ "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
+ "ldr qU42A, [uptr3, u_col_stride1]\n"
+ "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+ "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+ "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
+ "str qV23A, [vptr1, v_col_stride2]\n"
+ "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
+ "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
+ "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
+ "ldr qU52A, [uptr4, u_col_stride1]\n"
+ "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
+ "ldr qU11A, [%x[uptr0]], #0x10\n"
+ "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
+ "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
+ "ldr qU21A, [uptr1], #0x10\n"
+ "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
+ "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
+ "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
+ "str qV33A, [vptr2, v_col_stride2]\n"
+ "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+ "ldr qU31A, [uptr2], #0x10\n"
+ "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+ "ldr qU41A, [uptr3], #0x10\n"
+ "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+ "ldr qU51A, [uptr4], #0x10\n"
+ "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+ "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+ "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+ "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+ "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+ "str qV12A, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+ "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+ "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
+ "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
+ "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+ "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+ "str qV22A, [vptr1, v_col_stride1]\n"
+ "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
+ "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
+ "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
+ "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
+ "str qV32A, [vptr2, v_col_stride1]\n"
+ "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+ "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+ "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+ "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+ "str qV11A, [%x[vptr0]], #0x10\n"
+ "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+ "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
+ "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+ "str qV21A, [vptr1], #0x10\n"
+ "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
+ "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
+ "str qV31A, [vptr2], #0x10\n"
+
+ "4:" // End of method
+ ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
+ ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
+ ".unreq u_col_stride3\n" ".unreq u_col_stride4\n"
+ ".unreq wptr1\n" ".unreq wptr2\n"
+ ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
+ ".unreq vptr1\n" ".unreq vptr2\n"
+ ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
+
+ ".unreq qU22B\n" ".unreq qW13B\n" ".unreq qW13A\n" ".unreq qU51B\n"
+ ".unreq qU54B\n" ".unreq qU45A\n" ".unreq qU15A\n" ".unreq qU41B\n"
+ ".unreq qU24B\n" ".unreq qU21A\n"
+ ".unreq qV11B\n" ".unreq qU51A\n" ".unreq qU35A\n" ".unreq qU12A\n"
+ ".unreq qU42B\n" ".unreq qU44B\n" ".unreq qU13B\n" ".unreq qW33A\n"
+ ".unreq qV31B\n" ".unreq qV23A\n" ".unreq qU31A\n" ".unreq qU35B\n" ".unreq qU13A\n"
+ ".unreq qV23B\n" ".unreq qU11A\n" ".unreq qU25A\n" ".unreq qU43A\n" ".unreq qU52B\n"
+ ".unreq qU24A\n" ".unreq qU23B\n" ".unreq qV21A\n" ".unreq qV32B\n"
+ ".unreq qV33B\n" ".unreq qW11A\n" ".unreq qU31B\n"
+ ".unreq qW12B\n" ".unreq qU33A\n" ".unreq qU14A\n" ".unreq qU22A\n"
+ ".unreq qU25B\n" ".unreq qU53B\n" ".unreq qU42A\n" ".unreq qU44A\n"
+ ".unreq qU43B\n" ".unreq qW31A\n" ".unreq qU11B\n"
+ ".unreq qW11B\n" ".unreq qW32A\n"
+ ".unreq qU12B\n" ".unreq qU34B\n" ".unreq qW21A\n"
+ ".unreq qU14B\n" ".unreq qV21B\n" ".unreq qW22A\n"
+ ".unreq qW23B\n" ".unreq qW23A\n" ".unreq qU21B\n"
+ ".unreq qU32B\n" ".unreq qU34A\n" ".unreq qU45B\n" ".unreq qV31A\n"
+ ".unreq qW12A\n" ".unreq qU33B\n" ".unreq qU15B\n"
+ ".unreq qW33B\n" ".unreq qU54A\n" ".unreq qU23A\n"
+ ".unreq qW32B\n" ".unreq qV33A\n" ".unreq qW31B\n" ".unreq qV12A\n"
+ ".unreq qV12B\n" ".unreq qU41A\n" ".unreq qU53A\n"
+ ".unreq qV13A\n" ".unreq qU32A\n" ".unreq qW22B\n"
+ ".unreq qV22B\n" ".unreq qU52A\n" ".unreq qV13B\n" ".unreq qV32A\n"
+ ".unreq qU55A\n" ".unreq qU55B\n" ".unreq qV22A\n" ".unreq qW21B\n"
+ ".unreq qV11A\n"
+ ".unreq vU22B\n" ".unreq vW13B\n" ".unreq vW13A\n" ".unreq vU51B\n"
+ ".unreq vU54B\n" ".unreq vU45A\n" ".unreq vU15A\n" ".unreq vU41B\n"
+ ".unreq vU24B\n" ".unreq vU21A\n"
+ ".unreq vV11B\n" ".unreq vU51A\n" ".unreq vU35A\n" ".unreq vU12A\n"
+ ".unreq vU42B\n" ".unreq vU44B\n" ".unreq vU13B\n" ".unreq vW33A\n"
+ ".unreq vV31B\n" ".unreq vV23A\n" ".unreq vU31A\n" ".unreq vU35B\n" ".unreq vU13A\n"
+ ".unreq vV23B\n" ".unreq vU11A\n" ".unreq vU25A\n" ".unreq vU43A\n" ".unreq vU52B\n"
+ ".unreq vU24A\n" ".unreq vU23B\n" ".unreq vV21A\n" ".unreq vV32B\n"
+ ".unreq vV33B\n" ".unreq vW11A\n" ".unreq vU31B\n"
+ ".unreq vW12B\n" ".unreq vU33A\n" ".unreq vU14A\n" ".unreq vU22A\n"
+ ".unreq vU25B\n" ".unreq vU53B\n" ".unreq vU42A\n" ".unreq vU44A\n"
+ ".unreq vU43B\n" ".unreq vW31A\n" ".unreq vU11B\n"
+ ".unreq vW11B\n" ".unreq vW32A\n"
+ ".unreq vU12B\n" ".unreq vU34B\n" ".unreq vW21A\n"
+ ".unreq vU14B\n" ".unreq vV21B\n" ".unreq vW22A\n"
+ ".unreq vW23B\n" ".unreq vW23A\n" ".unreq vU21B\n"
+ ".unreq vU32B\n" ".unreq vU34A\n" ".unreq vU45B\n" ".unreq vV31A\n"
+ ".unreq vW12A\n" ".unreq vU33B\n" ".unreq vU15B\n"
+ ".unreq vW33B\n" ".unreq vU54A\n" ".unreq vU23A\n"
+ ".unreq vW32B\n" ".unreq vV33A\n" ".unreq vW31B\n" ".unreq vV12A\n"
+ ".unreq vV12B\n" ".unreq vU41A\n" ".unreq vU53A\n"
+ ".unreq vV13A\n" ".unreq vU32A\n" ".unreq vW22B\n"
+ ".unreq vV22B\n" ".unreq vU52A\n" ".unreq vV13B\n" ".unreq vV32A\n"
+ ".unreq vU55A\n" ".unreq vU55B\n" ".unreq vV22A\n" ".unreq vW21B\n"
+ ".unreq vV11A\n"
+ : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
+ [n_iters] "+r" (n_iters)
+ : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+ [u_col_stride] "r" (in_col_stride * sizeof(float)),
+ [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+ [w_col_stride] "r" (weight_col_stride * sizeof(float)),
+ [v_row_stride] "r" (out_row_stride * sizeof(float)),
+ [v_col_stride] "r" (out_col_stride * sizeof(float)),
+ [odd_tail] "r" (odd_tail)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
+ "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+ "x12", "cc", "memory"
+ );
+ }
+ if (channels_remaining)
+ {
+ // Fall back on the unoptimised version to clean up the tail
+ ConvImpl::process_tile<false>(
+ channels_remaining,
+ wptr0, weight_row_stride, weight_col_stride,
+ uptr0, in_row_stride, in_col_stride,
+ vptr0, out_row_stride, out_col_stride,
+ 0, 0, 0, 0, 0, 0
+ );
+ }
+}
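+
+// For reference: per channel, the specialised tile above computes a plain
+// 3x3, stride-1 correlation of a 5x5 input window into a 3x3 output tile;
+// the NEON code simply processes four channels per vector. A scalar sketch
+// of the same computation (strides in elements, as in the C-level
+// parameters) would be:
+//
+//   for (int i = 0; i < 3; i++)
+//   {
+//     for (int j = 0; j < 3; j++)
+//     {
+//       float acc = 0.0f;
+//       for (int m = 0; m < 3; m++)
+//       {
+//         for (int n = 0; n < 3; n++)
+//         {
+//           acc += inptr[(i + m)*in_row_stride + (j + n)*in_col_stride] *
+//                  weights[m*weight_row_stride + n*weight_col_stride];
+//         }
+//       }
+//       outptr[i*out_row_stride + j*out_col_stride] = acc;
+//     }
+//   }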
+
+#endif // __aarch64__
+
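+// Tile dispatch: the six-dimensional table of padding specialisations
+// removed above is replaced by an unpadded fast path, small per-edge
+// tables (top, left, bottom, right) and a generic fallback, the
+// unspecialised process_tile<false>.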
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
};
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 8d511b1..2510941 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -28,3416 +28,596 @@
using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>;
+#ifdef __aarch64__
+
template <>
-const Conv::TileFn Conv::tile_fns
- [max_in_pad_top]
- [max_in_pad_left]
- [max_in_pad_bottom]
- [max_in_pad_right]
- [max_out_pad_bottom]
- [max_out_pad_right] = {
- { // Input pad top = 0
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 0, 0, 0>,
- Conv::template process_tile<0, 0, 0, 0, 0, 1>,
- Conv::template process_tile<0, 0, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 0, 1, 0>,
- Conv::template process_tile<0, 0, 0, 0, 1, 1>,
- Conv::template process_tile<0, 0, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 0, 2, 0>,
- Conv::template process_tile<0, 0, 0, 0, 2, 1>,
- Conv::template process_tile<0, 0, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 1, 0, 0>,
- Conv::template process_tile<0, 0, 0, 1, 0, 1>,
- Conv::template process_tile<0, 0, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 1, 1, 0>,
- Conv::template process_tile<0, 0, 0, 1, 1, 1>,
- Conv::template process_tile<0, 0, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 1, 2, 0>,
- Conv::template process_tile<0, 0, 0, 1, 2, 1>,
- Conv::template process_tile<0, 0, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 2, 0, 0>,
- Conv::template process_tile<0, 0, 0, 2, 0, 1>,
- Conv::template process_tile<0, 0, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 2, 1, 0>,
- Conv::template process_tile<0, 0, 0, 2, 1, 1>,
- Conv::template process_tile<0, 0, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 2, 2, 0>,
- Conv::template process_tile<0, 0, 0, 2, 2, 1>,
- Conv::template process_tile<0, 0, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 3, 0, 0>,
- Conv::template process_tile<0, 0, 0, 3, 0, 1>,
- Conv::template process_tile<0, 0, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 3, 1, 0>,
- Conv::template process_tile<0, 0, 0, 3, 1, 1>,
- Conv::template process_tile<0, 0, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 3, 2, 0>,
- Conv::template process_tile<0, 0, 0, 3, 2, 1>,
- Conv::template process_tile<0, 0, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 4, 0, 0>,
- Conv::template process_tile<0, 0, 0, 4, 0, 1>,
- Conv::template process_tile<0, 0, 0, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 4, 1, 0>,
- Conv::template process_tile<0, 0, 0, 4, 1, 1>,
- Conv::template process_tile<0, 0, 0, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 4, 2, 0>,
- Conv::template process_tile<0, 0, 0, 4, 2, 1>,
- Conv::template process_tile<0, 0, 0, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 5, 0, 0>,
- Conv::template process_tile<0, 0, 0, 5, 0, 1>,
- Conv::template process_tile<0, 0, 0, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 5, 1, 0>,
- Conv::template process_tile<0, 0, 0, 5, 1, 1>,
- Conv::template process_tile<0, 0, 0, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 5, 2, 0>,
- Conv::template process_tile<0, 0, 0, 5, 2, 1>,
- Conv::template process_tile<0, 0, 0, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 6, 0, 0>,
- Conv::template process_tile<0, 0, 0, 6, 0, 1>,
- Conv::template process_tile<0, 0, 0, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 6, 1, 0>,
- Conv::template process_tile<0, 0, 0, 6, 1, 1>,
- Conv::template process_tile<0, 0, 0, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 6, 2, 0>,
- Conv::template process_tile<0, 0, 0, 6, 2, 1>,
- Conv::template process_tile<0, 0, 0, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 0, 0, 0>,
- Conv::template process_tile<0, 0, 1, 0, 0, 1>,
- Conv::template process_tile<0, 0, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 0, 1, 0>,
- Conv::template process_tile<0, 0, 1, 0, 1, 1>,
- Conv::template process_tile<0, 0, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 0, 2, 0>,
- Conv::template process_tile<0, 0, 1, 0, 2, 1>,
- Conv::template process_tile<0, 0, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 1, 0, 0>,
- Conv::template process_tile<0, 0, 1, 1, 0, 1>,
- Conv::template process_tile<0, 0, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 1, 1, 0>,
- Conv::template process_tile<0, 0, 1, 1, 1, 1>,
- Conv::template process_tile<0, 0, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 1, 2, 0>,
- Conv::template process_tile<0, 0, 1, 1, 2, 1>,
- Conv::template process_tile<0, 0, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 2, 0, 0>,
- Conv::template process_tile<0, 0, 1, 2, 0, 1>,
- Conv::template process_tile<0, 0, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 2, 1, 0>,
- Conv::template process_tile<0, 0, 1, 2, 1, 1>,
- Conv::template process_tile<0, 0, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 2, 2, 0>,
- Conv::template process_tile<0, 0, 1, 2, 2, 1>,
- Conv::template process_tile<0, 0, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 3, 0, 0>,
- Conv::template process_tile<0, 0, 1, 3, 0, 1>,
- Conv::template process_tile<0, 0, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 3, 1, 0>,
- Conv::template process_tile<0, 0, 1, 3, 1, 1>,
- Conv::template process_tile<0, 0, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 3, 2, 0>,
- Conv::template process_tile<0, 0, 1, 3, 2, 1>,
- Conv::template process_tile<0, 0, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 4, 0, 0>,
- Conv::template process_tile<0, 0, 1, 4, 0, 1>,
- Conv::template process_tile<0, 0, 1, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 4, 1, 0>,
- Conv::template process_tile<0, 0, 1, 4, 1, 1>,
- Conv::template process_tile<0, 0, 1, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 4, 2, 0>,
- Conv::template process_tile<0, 0, 1, 4, 2, 1>,
- Conv::template process_tile<0, 0, 1, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 5, 0, 0>,
- Conv::template process_tile<0, 0, 1, 5, 0, 1>,
- Conv::template process_tile<0, 0, 1, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 5, 1, 0>,
- Conv::template process_tile<0, 0, 1, 5, 1, 1>,
- Conv::template process_tile<0, 0, 1, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 5, 2, 0>,
- Conv::template process_tile<0, 0, 1, 5, 2, 1>,
- Conv::template process_tile<0, 0, 1, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 6, 0, 0>,
- Conv::template process_tile<0, 0, 1, 6, 0, 1>,
- Conv::template process_tile<0, 0, 1, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 6, 1, 0>,
- Conv::template process_tile<0, 0, 1, 6, 1, 1>,
- Conv::template process_tile<0, 0, 1, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 6, 2, 0>,
- Conv::template process_tile<0, 0, 1, 6, 2, 1>,
- Conv::template process_tile<0, 0, 1, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 0, 0, 0>,
- Conv::template process_tile<0, 0, 2, 0, 0, 1>,
- Conv::template process_tile<0, 0, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 0, 1, 0>,
- Conv::template process_tile<0, 0, 2, 0, 1, 1>,
- Conv::template process_tile<0, 0, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 0, 2, 0>,
- Conv::template process_tile<0, 0, 2, 0, 2, 1>,
- Conv::template process_tile<0, 0, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 1, 0, 0>,
- Conv::template process_tile<0, 0, 2, 1, 0, 1>,
- Conv::template process_tile<0, 0, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 1, 1, 0>,
- Conv::template process_tile<0, 0, 2, 1, 1, 1>,
- Conv::template process_tile<0, 0, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 1, 2, 0>,
- Conv::template process_tile<0, 0, 2, 1, 2, 1>,
- Conv::template process_tile<0, 0, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 2, 0, 0>,
- Conv::template process_tile<0, 0, 2, 2, 0, 1>,
- Conv::template process_tile<0, 0, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 2, 1, 0>,
- Conv::template process_tile<0, 0, 2, 2, 1, 1>,
- Conv::template process_tile<0, 0, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 2, 2, 0>,
- Conv::template process_tile<0, 0, 2, 2, 2, 1>,
- Conv::template process_tile<0, 0, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 3, 0, 0>,
- Conv::template process_tile<0, 0, 2, 3, 0, 1>,
- Conv::template process_tile<0, 0, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 3, 1, 0>,
- Conv::template process_tile<0, 0, 2, 3, 1, 1>,
- Conv::template process_tile<0, 0, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 3, 2, 0>,
- Conv::template process_tile<0, 0, 2, 3, 2, 1>,
- Conv::template process_tile<0, 0, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 4, 0, 0>,
- Conv::template process_tile<0, 0, 2, 4, 0, 1>,
- Conv::template process_tile<0, 0, 2, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 4, 1, 0>,
- Conv::template process_tile<0, 0, 2, 4, 1, 1>,
- Conv::template process_tile<0, 0, 2, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 4, 2, 0>,
- Conv::template process_tile<0, 0, 2, 4, 2, 1>,
- Conv::template process_tile<0, 0, 2, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 5, 0, 0>,
- Conv::template process_tile<0, 0, 2, 5, 0, 1>,
- Conv::template process_tile<0, 0, 2, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 5, 1, 0>,
- Conv::template process_tile<0, 0, 2, 5, 1, 1>,
- Conv::template process_tile<0, 0, 2, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 5, 2, 0>,
- Conv::template process_tile<0, 0, 2, 5, 2, 1>,
- Conv::template process_tile<0, 0, 2, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 6, 0, 0>,
- Conv::template process_tile<0, 0, 2, 6, 0, 1>,
- Conv::template process_tile<0, 0, 2, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 6, 1, 0>,
- Conv::template process_tile<0, 0, 2, 6, 1, 1>,
- Conv::template process_tile<0, 0, 2, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 6, 2, 0>,
- Conv::template process_tile<0, 0, 2, 6, 2, 1>,
- Conv::template process_tile<0, 0, 2, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 0, 0, 0>,
- Conv::template process_tile<0, 0, 3, 0, 0, 1>,
- Conv::template process_tile<0, 0, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 0, 1, 0>,
- Conv::template process_tile<0, 0, 3, 0, 1, 1>,
- Conv::template process_tile<0, 0, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 0, 2, 0>,
- Conv::template process_tile<0, 0, 3, 0, 2, 1>,
- Conv::template process_tile<0, 0, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 1, 0, 0>,
- Conv::template process_tile<0, 0, 3, 1, 0, 1>,
- Conv::template process_tile<0, 0, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 1, 1, 0>,
- Conv::template process_tile<0, 0, 3, 1, 1, 1>,
- Conv::template process_tile<0, 0, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 1, 2, 0>,
- Conv::template process_tile<0, 0, 3, 1, 2, 1>,
- Conv::template process_tile<0, 0, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 2, 0, 0>,
- Conv::template process_tile<0, 0, 3, 2, 0, 1>,
- Conv::template process_tile<0, 0, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 2, 1, 0>,
- Conv::template process_tile<0, 0, 3, 2, 1, 1>,
- Conv::template process_tile<0, 0, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 2, 2, 0>,
- Conv::template process_tile<0, 0, 3, 2, 2, 1>,
- Conv::template process_tile<0, 0, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 3, 0, 0>,
- Conv::template process_tile<0, 0, 3, 3, 0, 1>,
- Conv::template process_tile<0, 0, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 3, 1, 0>,
- Conv::template process_tile<0, 0, 3, 3, 1, 1>,
- Conv::template process_tile<0, 0, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 3, 2, 0>,
- Conv::template process_tile<0, 0, 3, 3, 2, 1>,
- Conv::template process_tile<0, 0, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 4, 0, 0>,
- Conv::template process_tile<0, 0, 3, 4, 0, 1>,
- Conv::template process_tile<0, 0, 3, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 4, 1, 0>,
- Conv::template process_tile<0, 0, 3, 4, 1, 1>,
- Conv::template process_tile<0, 0, 3, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 4, 2, 0>,
- Conv::template process_tile<0, 0, 3, 4, 2, 1>,
- Conv::template process_tile<0, 0, 3, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 5, 0, 0>,
- Conv::template process_tile<0, 0, 3, 5, 0, 1>,
- Conv::template process_tile<0, 0, 3, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 5, 1, 0>,
- Conv::template process_tile<0, 0, 3, 5, 1, 1>,
- Conv::template process_tile<0, 0, 3, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 5, 2, 0>,
- Conv::template process_tile<0, 0, 3, 5, 2, 1>,
- Conv::template process_tile<0, 0, 3, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 6, 0, 0>,
- Conv::template process_tile<0, 0, 3, 6, 0, 1>,
- Conv::template process_tile<0, 0, 3, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 6, 1, 0>,
- Conv::template process_tile<0, 0, 3, 6, 1, 1>,
- Conv::template process_tile<0, 0, 3, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 6, 2, 0>,
- Conv::template process_tile<0, 0, 3, 6, 2, 1>,
- Conv::template process_tile<0, 0, 3, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 0, 0, 0>,
- Conv::template process_tile<0, 0, 4, 0, 0, 1>,
- Conv::template process_tile<0, 0, 4, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 0, 1, 0>,
- Conv::template process_tile<0, 0, 4, 0, 1, 1>,
- Conv::template process_tile<0, 0, 4, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 0, 2, 0>,
- Conv::template process_tile<0, 0, 4, 0, 2, 1>,
- Conv::template process_tile<0, 0, 4, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 1, 0, 0>,
- Conv::template process_tile<0, 0, 4, 1, 0, 1>,
- Conv::template process_tile<0, 0, 4, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 1, 1, 0>,
- Conv::template process_tile<0, 0, 4, 1, 1, 1>,
- Conv::template process_tile<0, 0, 4, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 1, 2, 0>,
- Conv::template process_tile<0, 0, 4, 1, 2, 1>,
- Conv::template process_tile<0, 0, 4, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 2, 0, 0>,
- Conv::template process_tile<0, 0, 4, 2, 0, 1>,
- Conv::template process_tile<0, 0, 4, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 2, 1, 0>,
- Conv::template process_tile<0, 0, 4, 2, 1, 1>,
- Conv::template process_tile<0, 0, 4, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 2, 2, 0>,
- Conv::template process_tile<0, 0, 4, 2, 2, 1>,
- Conv::template process_tile<0, 0, 4, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 3, 0, 0>,
- Conv::template process_tile<0, 0, 4, 3, 0, 1>,
- Conv::template process_tile<0, 0, 4, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 3, 1, 0>,
- Conv::template process_tile<0, 0, 4, 3, 1, 1>,
- Conv::template process_tile<0, 0, 4, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 3, 2, 0>,
- Conv::template process_tile<0, 0, 4, 3, 2, 1>,
- Conv::template process_tile<0, 0, 4, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 4, 0, 0>,
- Conv::template process_tile<0, 0, 4, 4, 0, 1>,
- Conv::template process_tile<0, 0, 4, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 4, 1, 0>,
- Conv::template process_tile<0, 0, 4, 4, 1, 1>,
- Conv::template process_tile<0, 0, 4, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 4, 2, 0>,
- Conv::template process_tile<0, 0, 4, 4, 2, 1>,
- Conv::template process_tile<0, 0, 4, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 5, 0, 0>,
- Conv::template process_tile<0, 0, 4, 5, 0, 1>,
- Conv::template process_tile<0, 0, 4, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 5, 1, 0>,
- Conv::template process_tile<0, 0, 4, 5, 1, 1>,
- Conv::template process_tile<0, 0, 4, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 5, 2, 0>,
- Conv::template process_tile<0, 0, 4, 5, 2, 1>,
- Conv::template process_tile<0, 0, 4, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 6, 0, 0>,
- Conv::template process_tile<0, 0, 4, 6, 0, 1>,
- Conv::template process_tile<0, 0, 4, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 6, 1, 0>,
- Conv::template process_tile<0, 0, 4, 6, 1, 1>,
- Conv::template process_tile<0, 0, 4, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 6, 2, 0>,
- Conv::template process_tile<0, 0, 4, 6, 2, 1>,
- Conv::template process_tile<0, 0, 4, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 0, 0, 0>,
- Conv::template process_tile<0, 0, 5, 0, 0, 1>,
- Conv::template process_tile<0, 0, 5, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 0, 1, 0>,
- Conv::template process_tile<0, 0, 5, 0, 1, 1>,
- Conv::template process_tile<0, 0, 5, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 0, 2, 0>,
- Conv::template process_tile<0, 0, 5, 0, 2, 1>,
- Conv::template process_tile<0, 0, 5, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 1, 0, 0>,
- Conv::template process_tile<0, 0, 5, 1, 0, 1>,
- Conv::template process_tile<0, 0, 5, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 1, 1, 0>,
- Conv::template process_tile<0, 0, 5, 1, 1, 1>,
- Conv::template process_tile<0, 0, 5, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 1, 2, 0>,
- Conv::template process_tile<0, 0, 5, 1, 2, 1>,
- Conv::template process_tile<0, 0, 5, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 2, 0, 0>,
- Conv::template process_tile<0, 0, 5, 2, 0, 1>,
- Conv::template process_tile<0, 0, 5, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 2, 1, 0>,
- Conv::template process_tile<0, 0, 5, 2, 1, 1>,
- Conv::template process_tile<0, 0, 5, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 2, 2, 0>,
- Conv::template process_tile<0, 0, 5, 2, 2, 1>,
- Conv::template process_tile<0, 0, 5, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 3, 0, 0>,
- Conv::template process_tile<0, 0, 5, 3, 0, 1>,
- Conv::template process_tile<0, 0, 5, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 3, 1, 0>,
- Conv::template process_tile<0, 0, 5, 3, 1, 1>,
- Conv::template process_tile<0, 0, 5, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 3, 2, 0>,
- Conv::template process_tile<0, 0, 5, 3, 2, 1>,
- Conv::template process_tile<0, 0, 5, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 4, 0, 0>,
- Conv::template process_tile<0, 0, 5, 4, 0, 1>,
- Conv::template process_tile<0, 0, 5, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 4, 1, 0>,
- Conv::template process_tile<0, 0, 5, 4, 1, 1>,
- Conv::template process_tile<0, 0, 5, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 4, 2, 0>,
- Conv::template process_tile<0, 0, 5, 4, 2, 1>,
- Conv::template process_tile<0, 0, 5, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 5, 0, 0>,
- Conv::template process_tile<0, 0, 5, 5, 0, 1>,
- Conv::template process_tile<0, 0, 5, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 5, 1, 0>,
- Conv::template process_tile<0, 0, 5, 5, 1, 1>,
- Conv::template process_tile<0, 0, 5, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 5, 2, 0>,
- Conv::template process_tile<0, 0, 5, 5, 2, 1>,
- Conv::template process_tile<0, 0, 5, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 6, 0, 0>,
- Conv::template process_tile<0, 0, 5, 6, 0, 1>,
- Conv::template process_tile<0, 0, 5, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 6, 1, 0>,
- Conv::template process_tile<0, 0, 5, 6, 1, 1>,
- Conv::template process_tile<0, 0, 5, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 6, 2, 0>,
- Conv::template process_tile<0, 0, 5, 6, 2, 1>,
- Conv::template process_tile<0, 0, 5, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 0, 0, 0>,
- Conv::template process_tile<0, 0, 6, 0, 0, 1>,
- Conv::template process_tile<0, 0, 6, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 0, 1, 0>,
- Conv::template process_tile<0, 0, 6, 0, 1, 1>,
- Conv::template process_tile<0, 0, 6, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 0, 2, 0>,
- Conv::template process_tile<0, 0, 6, 0, 2, 1>,
- Conv::template process_tile<0, 0, 6, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 1, 0, 0>,
- Conv::template process_tile<0, 0, 6, 1, 0, 1>,
- Conv::template process_tile<0, 0, 6, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 1, 1, 0>,
- Conv::template process_tile<0, 0, 6, 1, 1, 1>,
- Conv::template process_tile<0, 0, 6, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 1, 2, 0>,
- Conv::template process_tile<0, 0, 6, 1, 2, 1>,
- Conv::template process_tile<0, 0, 6, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 2, 0, 0>,
- Conv::template process_tile<0, 0, 6, 2, 0, 1>,
- Conv::template process_tile<0, 0, 6, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 2, 1, 0>,
- Conv::template process_tile<0, 0, 6, 2, 1, 1>,
- Conv::template process_tile<0, 0, 6, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 2, 2, 0>,
- Conv::template process_tile<0, 0, 6, 2, 2, 1>,
- Conv::template process_tile<0, 0, 6, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 3, 0, 0>,
- Conv::template process_tile<0, 0, 6, 3, 0, 1>,
- Conv::template process_tile<0, 0, 6, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 3, 1, 0>,
- Conv::template process_tile<0, 0, 6, 3, 1, 1>,
- Conv::template process_tile<0, 0, 6, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 3, 2, 0>,
- Conv::template process_tile<0, 0, 6, 3, 2, 1>,
- Conv::template process_tile<0, 0, 6, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 4, 0, 0>,
- Conv::template process_tile<0, 0, 6, 4, 0, 1>,
- Conv::template process_tile<0, 0, 6, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 4, 1, 0>,
- Conv::template process_tile<0, 0, 6, 4, 1, 1>,
- Conv::template process_tile<0, 0, 6, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 4, 2, 0>,
- Conv::template process_tile<0, 0, 6, 4, 2, 1>,
- Conv::template process_tile<0, 0, 6, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 5, 0, 0>,
- Conv::template process_tile<0, 0, 6, 5, 0, 1>,
- Conv::template process_tile<0, 0, 6, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 5, 1, 0>,
- Conv::template process_tile<0, 0, 6, 5, 1, 1>,
- Conv::template process_tile<0, 0, 6, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 5, 2, 0>,
- Conv::template process_tile<0, 0, 6, 5, 2, 1>,
- Conv::template process_tile<0, 0, 6, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 6, 0, 0>,
- Conv::template process_tile<0, 0, 6, 6, 0, 1>,
- Conv::template process_tile<0, 0, 6, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 6, 1, 0>,
- Conv::template process_tile<0, 0, 6, 6, 1, 1>,
- Conv::template process_tile<0, 0, 6, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 6, 2, 0>,
- Conv::template process_tile<0, 0, 6, 6, 2, 1>,
- Conv::template process_tile<0, 0, 6, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 0, 0, 0>,
- Conv::template process_tile<0, 1, 0, 0, 0, 1>,
- Conv::template process_tile<0, 1, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 0, 1, 0>,
- Conv::template process_tile<0, 1, 0, 0, 1, 1>,
- Conv::template process_tile<0, 1, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 0, 2, 0>,
- Conv::template process_tile<0, 1, 0, 0, 2, 1>,
- Conv::template process_tile<0, 1, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 1, 0, 0>,
- Conv::template process_tile<0, 1, 0, 1, 0, 1>,
- Conv::template process_tile<0, 1, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 1, 1, 0>,
- Conv::template process_tile<0, 1, 0, 1, 1, 1>,
- Conv::template process_tile<0, 1, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 1, 2, 0>,
- Conv::template process_tile<0, 1, 0, 1, 2, 1>,
- Conv::template process_tile<0, 1, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 2, 0, 0>,
- Conv::template process_tile<0, 1, 0, 2, 0, 1>,
- Conv::template process_tile<0, 1, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 2, 1, 0>,
- Conv::template process_tile<0, 1, 0, 2, 1, 1>,
- Conv::template process_tile<0, 1, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 2, 2, 0>,
- Conv::template process_tile<0, 1, 0, 2, 2, 1>,
- Conv::template process_tile<0, 1, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 3, 0, 0>,
- Conv::template process_tile<0, 1, 0, 3, 0, 1>,
- Conv::template process_tile<0, 1, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 3, 1, 0>,
- Conv::template process_tile<0, 1, 0, 3, 1, 1>,
- Conv::template process_tile<0, 1, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 3, 2, 0>,
- Conv::template process_tile<0, 1, 0, 3, 2, 1>,
- Conv::template process_tile<0, 1, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 4, 0, 0>,
- Conv::template process_tile<0, 1, 0, 4, 0, 1>,
- Conv::template process_tile<0, 1, 0, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 4, 1, 0>,
- Conv::template process_tile<0, 1, 0, 4, 1, 1>,
- Conv::template process_tile<0, 1, 0, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 4, 2, 0>,
- Conv::template process_tile<0, 1, 0, 4, 2, 1>,
- Conv::template process_tile<0, 1, 0, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 5, 0, 0>,
- Conv::template process_tile<0, 1, 0, 5, 0, 1>,
- Conv::template process_tile<0, 1, 0, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 5, 1, 0>,
- Conv::template process_tile<0, 1, 0, 5, 1, 1>,
- Conv::template process_tile<0, 1, 0, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 5, 2, 0>,
- Conv::template process_tile<0, 1, 0, 5, 2, 1>,
- Conv::template process_tile<0, 1, 0, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 6, 0, 0>,
- Conv::template process_tile<0, 1, 0, 6, 0, 1>,
- Conv::template process_tile<0, 1, 0, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 6, 1, 0>,
- Conv::template process_tile<0, 1, 0, 6, 1, 1>,
- Conv::template process_tile<0, 1, 0, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 6, 2, 0>,
- Conv::template process_tile<0, 1, 0, 6, 2, 1>,
- Conv::template process_tile<0, 1, 0, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 0, 0, 0>,
- Conv::template process_tile<0, 1, 1, 0, 0, 1>,
- Conv::template process_tile<0, 1, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 0, 1, 0>,
- Conv::template process_tile<0, 1, 1, 0, 1, 1>,
- Conv::template process_tile<0, 1, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 0, 2, 0>,
- Conv::template process_tile<0, 1, 1, 0, 2, 1>,
- Conv::template process_tile<0, 1, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 1, 0, 0>,
- Conv::template process_tile<0, 1, 1, 1, 0, 1>,
- Conv::template process_tile<0, 1, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 1, 1, 0>,
- Conv::template process_tile<0, 1, 1, 1, 1, 1>,
- Conv::template process_tile<0, 1, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 1, 2, 0>,
- Conv::template process_tile<0, 1, 1, 1, 2, 1>,
- Conv::template process_tile<0, 1, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 2, 0, 0>,
- Conv::template process_tile<0, 1, 1, 2, 0, 1>,
- Conv::template process_tile<0, 1, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 2, 1, 0>,
- Conv::template process_tile<0, 1, 1, 2, 1, 1>,
- Conv::template process_tile<0, 1, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 2, 2, 0>,
- Conv::template process_tile<0, 1, 1, 2, 2, 1>,
- Conv::template process_tile<0, 1, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 3, 0, 0>,
- Conv::template process_tile<0, 1, 1, 3, 0, 1>,
- Conv::template process_tile<0, 1, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 3, 1, 0>,
- Conv::template process_tile<0, 1, 1, 3, 1, 1>,
- Conv::template process_tile<0, 1, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 3, 2, 0>,
- Conv::template process_tile<0, 1, 1, 3, 2, 1>,
- Conv::template process_tile<0, 1, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 4, 0, 0>,
- Conv::template process_tile<0, 1, 1, 4, 0, 1>,
- Conv::template process_tile<0, 1, 1, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 4, 1, 0>,
- Conv::template process_tile<0, 1, 1, 4, 1, 1>,
- Conv::template process_tile<0, 1, 1, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 4, 2, 0>,
- Conv::template process_tile<0, 1, 1, 4, 2, 1>,
- Conv::template process_tile<0, 1, 1, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 5, 0, 0>,
- Conv::template process_tile<0, 1, 1, 5, 0, 1>,
- Conv::template process_tile<0, 1, 1, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 5, 1, 0>,
- Conv::template process_tile<0, 1, 1, 5, 1, 1>,
- Conv::template process_tile<0, 1, 1, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 5, 2, 0>,
- Conv::template process_tile<0, 1, 1, 5, 2, 1>,
- Conv::template process_tile<0, 1, 1, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 6, 0, 0>,
- Conv::template process_tile<0, 1, 1, 6, 0, 1>,
- Conv::template process_tile<0, 1, 1, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 6, 1, 0>,
- Conv::template process_tile<0, 1, 1, 6, 1, 1>,
- Conv::template process_tile<0, 1, 1, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 6, 2, 0>,
- Conv::template process_tile<0, 1, 1, 6, 2, 1>,
- Conv::template process_tile<0, 1, 1, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 0, 0, 0>,
- Conv::template process_tile<0, 1, 2, 0, 0, 1>,
- Conv::template process_tile<0, 1, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 0, 1, 0>,
- Conv::template process_tile<0, 1, 2, 0, 1, 1>,
- Conv::template process_tile<0, 1, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 0, 2, 0>,
- Conv::template process_tile<0, 1, 2, 0, 2, 1>,
- Conv::template process_tile<0, 1, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 1, 0, 0>,
- Conv::template process_tile<0, 1, 2, 1, 0, 1>,
- Conv::template process_tile<0, 1, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 1, 1, 0>,
- Conv::template process_tile<0, 1, 2, 1, 1, 1>,
- Conv::template process_tile<0, 1, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 1, 2, 0>,
- Conv::template process_tile<0, 1, 2, 1, 2, 1>,
- Conv::template process_tile<0, 1, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 2, 0, 0>,
- Conv::template process_tile<0, 1, 2, 2, 0, 1>,
- Conv::template process_tile<0, 1, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 2, 1, 0>,
- Conv::template process_tile<0, 1, 2, 2, 1, 1>,
- Conv::template process_tile<0, 1, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 2, 2, 0>,
- Conv::template process_tile<0, 1, 2, 2, 2, 1>,
- Conv::template process_tile<0, 1, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 3, 0, 0>,
- Conv::template process_tile<0, 1, 2, 3, 0, 1>,
- Conv::template process_tile<0, 1, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 3, 1, 0>,
- Conv::template process_tile<0, 1, 2, 3, 1, 1>,
- Conv::template process_tile<0, 1, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 3, 2, 0>,
- Conv::template process_tile<0, 1, 2, 3, 2, 1>,
- Conv::template process_tile<0, 1, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 4, 0, 0>,
- Conv::template process_tile<0, 1, 2, 4, 0, 1>,
- Conv::template process_tile<0, 1, 2, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 4, 1, 0>,
- Conv::template process_tile<0, 1, 2, 4, 1, 1>,
- Conv::template process_tile<0, 1, 2, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 4, 2, 0>,
- Conv::template process_tile<0, 1, 2, 4, 2, 1>,
- Conv::template process_tile<0, 1, 2, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 5, 0, 0>,
- Conv::template process_tile<0, 1, 2, 5, 0, 1>,
- Conv::template process_tile<0, 1, 2, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 5, 1, 0>,
- Conv::template process_tile<0, 1, 2, 5, 1, 1>,
- Conv::template process_tile<0, 1, 2, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 5, 2, 0>,
- Conv::template process_tile<0, 1, 2, 5, 2, 1>,
- Conv::template process_tile<0, 1, 2, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 6, 0, 0>,
- Conv::template process_tile<0, 1, 2, 6, 0, 1>,
- Conv::template process_tile<0, 1, 2, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 6, 1, 0>,
- Conv::template process_tile<0, 1, 2, 6, 1, 1>,
- Conv::template process_tile<0, 1, 2, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 6, 2, 0>,
- Conv::template process_tile<0, 1, 2, 6, 2, 1>,
- Conv::template process_tile<0, 1, 2, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 0, 0, 0>,
- Conv::template process_tile<0, 1, 3, 0, 0, 1>,
- Conv::template process_tile<0, 1, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 0, 1, 0>,
- Conv::template process_tile<0, 1, 3, 0, 1, 1>,
- Conv::template process_tile<0, 1, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 0, 2, 0>,
- Conv::template process_tile<0, 1, 3, 0, 2, 1>,
- Conv::template process_tile<0, 1, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 1, 0, 0>,
- Conv::template process_tile<0, 1, 3, 1, 0, 1>,
- Conv::template process_tile<0, 1, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 1, 1, 0>,
- Conv::template process_tile<0, 1, 3, 1, 1, 1>,
- Conv::template process_tile<0, 1, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 1, 2, 0>,
- Conv::template process_tile<0, 1, 3, 1, 2, 1>,
- Conv::template process_tile<0, 1, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 2, 0, 0>,
- Conv::template process_tile<0, 1, 3, 2, 0, 1>,
- Conv::template process_tile<0, 1, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 2, 1, 0>,
- Conv::template process_tile<0, 1, 3, 2, 1, 1>,
- Conv::template process_tile<0, 1, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 2, 2, 0>,
- Conv::template process_tile<0, 1, 3, 2, 2, 1>,
- Conv::template process_tile<0, 1, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 3, 0, 0>,
- Conv::template process_tile<0, 1, 3, 3, 0, 1>,
- Conv::template process_tile<0, 1, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 3, 1, 0>,
- Conv::template process_tile<0, 1, 3, 3, 1, 1>,
- Conv::template process_tile<0, 1, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 3, 2, 0>,
- Conv::template process_tile<0, 1, 3, 3, 2, 1>,
- Conv::template process_tile<0, 1, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 4, 0, 0>,
- Conv::template process_tile<0, 1, 3, 4, 0, 1>,
- Conv::template process_tile<0, 1, 3, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 4, 1, 0>,
- Conv::template process_tile<0, 1, 3, 4, 1, 1>,
- Conv::template process_tile<0, 1, 3, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 4, 2, 0>,
- Conv::template process_tile<0, 1, 3, 4, 2, 1>,
- Conv::template process_tile<0, 1, 3, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 5, 0, 0>,
- Conv::template process_tile<0, 1, 3, 5, 0, 1>,
- Conv::template process_tile<0, 1, 3, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 5, 1, 0>,
- Conv::template process_tile<0, 1, 3, 5, 1, 1>,
- Conv::template process_tile<0, 1, 3, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 5, 2, 0>,
- Conv::template process_tile<0, 1, 3, 5, 2, 1>,
- Conv::template process_tile<0, 1, 3, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 6, 0, 0>,
- Conv::template process_tile<0, 1, 3, 6, 0, 1>,
- Conv::template process_tile<0, 1, 3, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 6, 1, 0>,
- Conv::template process_tile<0, 1, 3, 6, 1, 1>,
- Conv::template process_tile<0, 1, 3, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 6, 2, 0>,
- Conv::template process_tile<0, 1, 3, 6, 2, 1>,
- Conv::template process_tile<0, 1, 3, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 0, 0, 0>,
- Conv::template process_tile<0, 1, 4, 0, 0, 1>,
- Conv::template process_tile<0, 1, 4, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 0, 1, 0>,
- Conv::template process_tile<0, 1, 4, 0, 1, 1>,
- Conv::template process_tile<0, 1, 4, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 0, 2, 0>,
- Conv::template process_tile<0, 1, 4, 0, 2, 1>,
- Conv::template process_tile<0, 1, 4, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 1, 0, 0>,
- Conv::template process_tile<0, 1, 4, 1, 0, 1>,
- Conv::template process_tile<0, 1, 4, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 1, 1, 0>,
- Conv::template process_tile<0, 1, 4, 1, 1, 1>,
- Conv::template process_tile<0, 1, 4, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 1, 2, 0>,
- Conv::template process_tile<0, 1, 4, 1, 2, 1>,
- Conv::template process_tile<0, 1, 4, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 2, 0, 0>,
- Conv::template process_tile<0, 1, 4, 2, 0, 1>,
- Conv::template process_tile<0, 1, 4, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 2, 1, 0>,
- Conv::template process_tile<0, 1, 4, 2, 1, 1>,
- Conv::template process_tile<0, 1, 4, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 2, 2, 0>,
- Conv::template process_tile<0, 1, 4, 2, 2, 1>,
- Conv::template process_tile<0, 1, 4, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 3, 0, 0>,
- Conv::template process_tile<0, 1, 4, 3, 0, 1>,
- Conv::template process_tile<0, 1, 4, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 3, 1, 0>,
- Conv::template process_tile<0, 1, 4, 3, 1, 1>,
- Conv::template process_tile<0, 1, 4, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 3, 2, 0>,
- Conv::template process_tile<0, 1, 4, 3, 2, 1>,
- Conv::template process_tile<0, 1, 4, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 4, 0, 0>,
- Conv::template process_tile<0, 1, 4, 4, 0, 1>,
- Conv::template process_tile<0, 1, 4, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 4, 1, 0>,
- Conv::template process_tile<0, 1, 4, 4, 1, 1>,
- Conv::template process_tile<0, 1, 4, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 4, 2, 0>,
- Conv::template process_tile<0, 1, 4, 4, 2, 1>,
- Conv::template process_tile<0, 1, 4, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 5, 0, 0>,
- Conv::template process_tile<0, 1, 4, 5, 0, 1>,
- Conv::template process_tile<0, 1, 4, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 5, 1, 0>,
- Conv::template process_tile<0, 1, 4, 5, 1, 1>,
- Conv::template process_tile<0, 1, 4, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 5, 2, 0>,
- Conv::template process_tile<0, 1, 4, 5, 2, 1>,
- Conv::template process_tile<0, 1, 4, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 6, 0, 0>,
- Conv::template process_tile<0, 1, 4, 6, 0, 1>,
- Conv::template process_tile<0, 1, 4, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 6, 1, 0>,
- Conv::template process_tile<0, 1, 4, 6, 1, 1>,
- Conv::template process_tile<0, 1, 4, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 6, 2, 0>,
- Conv::template process_tile<0, 1, 4, 6, 2, 1>,
- Conv::template process_tile<0, 1, 4, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 0, 0, 0>,
- Conv::template process_tile<0, 1, 5, 0, 0, 1>,
- Conv::template process_tile<0, 1, 5, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 0, 1, 0>,
- Conv::template process_tile<0, 1, 5, 0, 1, 1>,
- Conv::template process_tile<0, 1, 5, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 0, 2, 0>,
- Conv::template process_tile<0, 1, 5, 0, 2, 1>,
- Conv::template process_tile<0, 1, 5, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 1, 0, 0>,
- Conv::template process_tile<0, 1, 5, 1, 0, 1>,
- Conv::template process_tile<0, 1, 5, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 1, 1, 0>,
- Conv::template process_tile<0, 1, 5, 1, 1, 1>,
- Conv::template process_tile<0, 1, 5, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 1, 2, 0>,
- Conv::template process_tile<0, 1, 5, 1, 2, 1>,
- Conv::template process_tile<0, 1, 5, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 2, 0, 0>,
- Conv::template process_tile<0, 1, 5, 2, 0, 1>,
- Conv::template process_tile<0, 1, 5, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 2, 1, 0>,
- Conv::template process_tile<0, 1, 5, 2, 1, 1>,
- Conv::template process_tile<0, 1, 5, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 2, 2, 0>,
- Conv::template process_tile<0, 1, 5, 2, 2, 1>,
- Conv::template process_tile<0, 1, 5, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 3, 0, 0>,
- Conv::template process_tile<0, 1, 5, 3, 0, 1>,
- Conv::template process_tile<0, 1, 5, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 3, 1, 0>,
- Conv::template process_tile<0, 1, 5, 3, 1, 1>,
- Conv::template process_tile<0, 1, 5, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 3, 2, 0>,
- Conv::template process_tile<0, 1, 5, 3, 2, 1>,
- Conv::template process_tile<0, 1, 5, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 4, 0, 0>,
- Conv::template process_tile<0, 1, 5, 4, 0, 1>,
- Conv::template process_tile<0, 1, 5, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 4, 1, 0>,
- Conv::template process_tile<0, 1, 5, 4, 1, 1>,
- Conv::template process_tile<0, 1, 5, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 4, 2, 0>,
- Conv::template process_tile<0, 1, 5, 4, 2, 1>,
- Conv::template process_tile<0, 1, 5, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 5, 0, 0>,
- Conv::template process_tile<0, 1, 5, 5, 0, 1>,
- Conv::template process_tile<0, 1, 5, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 5, 1, 0>,
- Conv::template process_tile<0, 1, 5, 5, 1, 1>,
- Conv::template process_tile<0, 1, 5, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 5, 2, 0>,
- Conv::template process_tile<0, 1, 5, 5, 2, 1>,
- Conv::template process_tile<0, 1, 5, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 6, 0, 0>,
- Conv::template process_tile<0, 1, 5, 6, 0, 1>,
- Conv::template process_tile<0, 1, 5, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 6, 1, 0>,
- Conv::template process_tile<0, 1, 5, 6, 1, 1>,
- Conv::template process_tile<0, 1, 5, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 6, 2, 0>,
- Conv::template process_tile<0, 1, 5, 6, 2, 1>,
- Conv::template process_tile<0, 1, 5, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 0, 0, 0>,
- Conv::template process_tile<0, 1, 6, 0, 0, 1>,
- Conv::template process_tile<0, 1, 6, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 0, 1, 0>,
- Conv::template process_tile<0, 1, 6, 0, 1, 1>,
- Conv::template process_tile<0, 1, 6, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 0, 2, 0>,
- Conv::template process_tile<0, 1, 6, 0, 2, 1>,
- Conv::template process_tile<0, 1, 6, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 1, 0, 0>,
- Conv::template process_tile<0, 1, 6, 1, 0, 1>,
- Conv::template process_tile<0, 1, 6, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 1, 1, 0>,
- Conv::template process_tile<0, 1, 6, 1, 1, 1>,
- Conv::template process_tile<0, 1, 6, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 1, 2, 0>,
- Conv::template process_tile<0, 1, 6, 1, 2, 1>,
- Conv::template process_tile<0, 1, 6, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 2, 0, 0>,
- Conv::template process_tile<0, 1, 6, 2, 0, 1>,
- Conv::template process_tile<0, 1, 6, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 2, 1, 0>,
- Conv::template process_tile<0, 1, 6, 2, 1, 1>,
- Conv::template process_tile<0, 1, 6, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 2, 2, 0>,
- Conv::template process_tile<0, 1, 6, 2, 2, 1>,
- Conv::template process_tile<0, 1, 6, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 3, 0, 0>,
- Conv::template process_tile<0, 1, 6, 3, 0, 1>,
- Conv::template process_tile<0, 1, 6, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 3, 1, 0>,
- Conv::template process_tile<0, 1, 6, 3, 1, 1>,
- Conv::template process_tile<0, 1, 6, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 3, 2, 0>,
- Conv::template process_tile<0, 1, 6, 3, 2, 1>,
- Conv::template process_tile<0, 1, 6, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 4, 0, 0>,
- Conv::template process_tile<0, 1, 6, 4, 0, 1>,
- Conv::template process_tile<0, 1, 6, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 4, 1, 0>,
- Conv::template process_tile<0, 1, 6, 4, 1, 1>,
- Conv::template process_tile<0, 1, 6, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 4, 2, 0>,
- Conv::template process_tile<0, 1, 6, 4, 2, 1>,
- Conv::template process_tile<0, 1, 6, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 5, 0, 0>,
- Conv::template process_tile<0, 1, 6, 5, 0, 1>,
- Conv::template process_tile<0, 1, 6, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 5, 1, 0>,
- Conv::template process_tile<0, 1, 6, 5, 1, 1>,
- Conv::template process_tile<0, 1, 6, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 5, 2, 0>,
- Conv::template process_tile<0, 1, 6, 5, 2, 1>,
- Conv::template process_tile<0, 1, 6, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 6, 0, 0>,
- Conv::template process_tile<0, 1, 6, 6, 0, 1>,
- Conv::template process_tile<0, 1, 6, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 6, 1, 0>,
- Conv::template process_tile<0, 1, 6, 6, 1, 1>,
- Conv::template process_tile<0, 1, 6, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 6, 2, 0>,
- Conv::template process_tile<0, 1, 6, 6, 2, 1>,
- Conv::template process_tile<0, 1, 6, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 1
- }, // Input pad top = 0
- { // Input pad top = 1
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 0, 0, 0>,
- Conv::template process_tile<1, 0, 0, 0, 0, 1>,
- Conv::template process_tile<1, 0, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 0, 1, 0>,
- Conv::template process_tile<1, 0, 0, 0, 1, 1>,
- Conv::template process_tile<1, 0, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 0, 2, 0>,
- Conv::template process_tile<1, 0, 0, 0, 2, 1>,
- Conv::template process_tile<1, 0, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 1, 0, 0>,
- Conv::template process_tile<1, 0, 0, 1, 0, 1>,
- Conv::template process_tile<1, 0, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 1, 1, 0>,
- Conv::template process_tile<1, 0, 0, 1, 1, 1>,
- Conv::template process_tile<1, 0, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 1, 2, 0>,
- Conv::template process_tile<1, 0, 0, 1, 2, 1>,
- Conv::template process_tile<1, 0, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 2, 0, 0>,
- Conv::template process_tile<1, 0, 0, 2, 0, 1>,
- Conv::template process_tile<1, 0, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 2, 1, 0>,
- Conv::template process_tile<1, 0, 0, 2, 1, 1>,
- Conv::template process_tile<1, 0, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 2, 2, 0>,
- Conv::template process_tile<1, 0, 0, 2, 2, 1>,
- Conv::template process_tile<1, 0, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 3, 0, 0>,
- Conv::template process_tile<1, 0, 0, 3, 0, 1>,
- Conv::template process_tile<1, 0, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 3, 1, 0>,
- Conv::template process_tile<1, 0, 0, 3, 1, 1>,
- Conv::template process_tile<1, 0, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 3, 2, 0>,
- Conv::template process_tile<1, 0, 0, 3, 2, 1>,
- Conv::template process_tile<1, 0, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 4, 0, 0>,
- Conv::template process_tile<1, 0, 0, 4, 0, 1>,
- Conv::template process_tile<1, 0, 0, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 4, 1, 0>,
- Conv::template process_tile<1, 0, 0, 4, 1, 1>,
- Conv::template process_tile<1, 0, 0, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 4, 2, 0>,
- Conv::template process_tile<1, 0, 0, 4, 2, 1>,
- Conv::template process_tile<1, 0, 0, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 5, 0, 0>,
- Conv::template process_tile<1, 0, 0, 5, 0, 1>,
- Conv::template process_tile<1, 0, 0, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 5, 1, 0>,
- Conv::template process_tile<1, 0, 0, 5, 1, 1>,
- Conv::template process_tile<1, 0, 0, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 5, 2, 0>,
- Conv::template process_tile<1, 0, 0, 5, 2, 1>,
- Conv::template process_tile<1, 0, 0, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 6, 0, 0>,
- Conv::template process_tile<1, 0, 0, 6, 0, 1>,
- Conv::template process_tile<1, 0, 0, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 6, 1, 0>,
- Conv::template process_tile<1, 0, 0, 6, 1, 1>,
- Conv::template process_tile<1, 0, 0, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 6, 2, 0>,
- Conv::template process_tile<1, 0, 0, 6, 2, 1>,
- Conv::template process_tile<1, 0, 0, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 0, 0, 0>,
- Conv::template process_tile<1, 0, 1, 0, 0, 1>,
- Conv::template process_tile<1, 0, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 0, 1, 0>,
- Conv::template process_tile<1, 0, 1, 0, 1, 1>,
- Conv::template process_tile<1, 0, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 0, 2, 0>,
- Conv::template process_tile<1, 0, 1, 0, 2, 1>,
- Conv::template process_tile<1, 0, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 1, 0, 0>,
- Conv::template process_tile<1, 0, 1, 1, 0, 1>,
- Conv::template process_tile<1, 0, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 1, 1, 0>,
- Conv::template process_tile<1, 0, 1, 1, 1, 1>,
- Conv::template process_tile<1, 0, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 1, 2, 0>,
- Conv::template process_tile<1, 0, 1, 1, 2, 1>,
- Conv::template process_tile<1, 0, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 2, 0, 0>,
- Conv::template process_tile<1, 0, 1, 2, 0, 1>,
- Conv::template process_tile<1, 0, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 2, 1, 0>,
- Conv::template process_tile<1, 0, 1, 2, 1, 1>,
- Conv::template process_tile<1, 0, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 2, 2, 0>,
- Conv::template process_tile<1, 0, 1, 2, 2, 1>,
- Conv::template process_tile<1, 0, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 3, 0, 0>,
- Conv::template process_tile<1, 0, 1, 3, 0, 1>,
- Conv::template process_tile<1, 0, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 3, 1, 0>,
- Conv::template process_tile<1, 0, 1, 3, 1, 1>,
- Conv::template process_tile<1, 0, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 3, 2, 0>,
- Conv::template process_tile<1, 0, 1, 3, 2, 1>,
- Conv::template process_tile<1, 0, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 4, 0, 0>,
- Conv::template process_tile<1, 0, 1, 4, 0, 1>,
- Conv::template process_tile<1, 0, 1, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 4, 1, 0>,
- Conv::template process_tile<1, 0, 1, 4, 1, 1>,
- Conv::template process_tile<1, 0, 1, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 4, 2, 0>,
- Conv::template process_tile<1, 0, 1, 4, 2, 1>,
- Conv::template process_tile<1, 0, 1, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 5, 0, 0>,
- Conv::template process_tile<1, 0, 1, 5, 0, 1>,
- Conv::template process_tile<1, 0, 1, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 5, 1, 0>,
- Conv::template process_tile<1, 0, 1, 5, 1, 1>,
- Conv::template process_tile<1, 0, 1, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 5, 2, 0>,
- Conv::template process_tile<1, 0, 1, 5, 2, 1>,
- Conv::template process_tile<1, 0, 1, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 6, 0, 0>,
- Conv::template process_tile<1, 0, 1, 6, 0, 1>,
- Conv::template process_tile<1, 0, 1, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 6, 1, 0>,
- Conv::template process_tile<1, 0, 1, 6, 1, 1>,
- Conv::template process_tile<1, 0, 1, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 6, 2, 0>,
- Conv::template process_tile<1, 0, 1, 6, 2, 1>,
- Conv::template process_tile<1, 0, 1, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 0, 0, 0>,
- Conv::template process_tile<1, 0, 2, 0, 0, 1>,
- Conv::template process_tile<1, 0, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 0, 1, 0>,
- Conv::template process_tile<1, 0, 2, 0, 1, 1>,
- Conv::template process_tile<1, 0, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 0, 2, 0>,
- Conv::template process_tile<1, 0, 2, 0, 2, 1>,
- Conv::template process_tile<1, 0, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 1, 0, 0>,
- Conv::template process_tile<1, 0, 2, 1, 0, 1>,
- Conv::template process_tile<1, 0, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 1, 1, 0>,
- Conv::template process_tile<1, 0, 2, 1, 1, 1>,
- Conv::template process_tile<1, 0, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 1, 2, 0>,
- Conv::template process_tile<1, 0, 2, 1, 2, 1>,
- Conv::template process_tile<1, 0, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 2, 0, 0>,
- Conv::template process_tile<1, 0, 2, 2, 0, 1>,
- Conv::template process_tile<1, 0, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 2, 1, 0>,
- Conv::template process_tile<1, 0, 2, 2, 1, 1>,
- Conv::template process_tile<1, 0, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 2, 2, 0>,
- Conv::template process_tile<1, 0, 2, 2, 2, 1>,
- Conv::template process_tile<1, 0, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 3, 0, 0>,
- Conv::template process_tile<1, 0, 2, 3, 0, 1>,
- Conv::template process_tile<1, 0, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 3, 1, 0>,
- Conv::template process_tile<1, 0, 2, 3, 1, 1>,
- Conv::template process_tile<1, 0, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 3, 2, 0>,
- Conv::template process_tile<1, 0, 2, 3, 2, 1>,
- Conv::template process_tile<1, 0, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 4, 0, 0>,
- Conv::template process_tile<1, 0, 2, 4, 0, 1>,
- Conv::template process_tile<1, 0, 2, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 4, 1, 0>,
- Conv::template process_tile<1, 0, 2, 4, 1, 1>,
- Conv::template process_tile<1, 0, 2, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 4, 2, 0>,
- Conv::template process_tile<1, 0, 2, 4, 2, 1>,
- Conv::template process_tile<1, 0, 2, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 5, 0, 0>,
- Conv::template process_tile<1, 0, 2, 5, 0, 1>,
- Conv::template process_tile<1, 0, 2, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 5, 1, 0>,
- Conv::template process_tile<1, 0, 2, 5, 1, 1>,
- Conv::template process_tile<1, 0, 2, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 5, 2, 0>,
- Conv::template process_tile<1, 0, 2, 5, 2, 1>,
- Conv::template process_tile<1, 0, 2, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 6, 0, 0>,
- Conv::template process_tile<1, 0, 2, 6, 0, 1>,
- Conv::template process_tile<1, 0, 2, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 6, 1, 0>,
- Conv::template process_tile<1, 0, 2, 6, 1, 1>,
- Conv::template process_tile<1, 0, 2, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 6, 2, 0>,
- Conv::template process_tile<1, 0, 2, 6, 2, 1>,
- Conv::template process_tile<1, 0, 2, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 0, 0, 0>,
- Conv::template process_tile<1, 0, 3, 0, 0, 1>,
- Conv::template process_tile<1, 0, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 0, 1, 0>,
- Conv::template process_tile<1, 0, 3, 0, 1, 1>,
- Conv::template process_tile<1, 0, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 0, 2, 0>,
- Conv::template process_tile<1, 0, 3, 0, 2, 1>,
- Conv::template process_tile<1, 0, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 1, 0, 0>,
- Conv::template process_tile<1, 0, 3, 1, 0, 1>,
- Conv::template process_tile<1, 0, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 1, 1, 0>,
- Conv::template process_tile<1, 0, 3, 1, 1, 1>,
- Conv::template process_tile<1, 0, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 1, 2, 0>,
- Conv::template process_tile<1, 0, 3, 1, 2, 1>,
- Conv::template process_tile<1, 0, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 2, 0, 0>,
- Conv::template process_tile<1, 0, 3, 2, 0, 1>,
- Conv::template process_tile<1, 0, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 2, 1, 0>,
- Conv::template process_tile<1, 0, 3, 2, 1, 1>,
- Conv::template process_tile<1, 0, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 2, 2, 0>,
- Conv::template process_tile<1, 0, 3, 2, 2, 1>,
- Conv::template process_tile<1, 0, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 3, 0, 0>,
- Conv::template process_tile<1, 0, 3, 3, 0, 1>,
- Conv::template process_tile<1, 0, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 3, 1, 0>,
- Conv::template process_tile<1, 0, 3, 3, 1, 1>,
- Conv::template process_tile<1, 0, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 3, 2, 0>,
- Conv::template process_tile<1, 0, 3, 3, 2, 1>,
- Conv::template process_tile<1, 0, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 4, 0, 0>,
- Conv::template process_tile<1, 0, 3, 4, 0, 1>,
- Conv::template process_tile<1, 0, 3, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 4, 1, 0>,
- Conv::template process_tile<1, 0, 3, 4, 1, 1>,
- Conv::template process_tile<1, 0, 3, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 4, 2, 0>,
- Conv::template process_tile<1, 0, 3, 4, 2, 1>,
- Conv::template process_tile<1, 0, 3, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 5, 0, 0>,
- Conv::template process_tile<1, 0, 3, 5, 0, 1>,
- Conv::template process_tile<1, 0, 3, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 5, 1, 0>,
- Conv::template process_tile<1, 0, 3, 5, 1, 1>,
- Conv::template process_tile<1, 0, 3, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 5, 2, 0>,
- Conv::template process_tile<1, 0, 3, 5, 2, 1>,
- Conv::template process_tile<1, 0, 3, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 6, 0, 0>,
- Conv::template process_tile<1, 0, 3, 6, 0, 1>,
- Conv::template process_tile<1, 0, 3, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 6, 1, 0>,
- Conv::template process_tile<1, 0, 3, 6, 1, 1>,
- Conv::template process_tile<1, 0, 3, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 6, 2, 0>,
- Conv::template process_tile<1, 0, 3, 6, 2, 1>,
- Conv::template process_tile<1, 0, 3, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 0, 0, 0>,
- Conv::template process_tile<1, 0, 4, 0, 0, 1>,
- Conv::template process_tile<1, 0, 4, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 0, 1, 0>,
- Conv::template process_tile<1, 0, 4, 0, 1, 1>,
- Conv::template process_tile<1, 0, 4, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 0, 2, 0>,
- Conv::template process_tile<1, 0, 4, 0, 2, 1>,
- Conv::template process_tile<1, 0, 4, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 1, 0, 0>,
- Conv::template process_tile<1, 0, 4, 1, 0, 1>,
- Conv::template process_tile<1, 0, 4, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 1, 1, 0>,
- Conv::template process_tile<1, 0, 4, 1, 1, 1>,
- Conv::template process_tile<1, 0, 4, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 1, 2, 0>,
- Conv::template process_tile<1, 0, 4, 1, 2, 1>,
- Conv::template process_tile<1, 0, 4, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 2, 0, 0>,
- Conv::template process_tile<1, 0, 4, 2, 0, 1>,
- Conv::template process_tile<1, 0, 4, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 2, 1, 0>,
- Conv::template process_tile<1, 0, 4, 2, 1, 1>,
- Conv::template process_tile<1, 0, 4, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 2, 2, 0>,
- Conv::template process_tile<1, 0, 4, 2, 2, 1>,
- Conv::template process_tile<1, 0, 4, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 3, 0, 0>,
- Conv::template process_tile<1, 0, 4, 3, 0, 1>,
- Conv::template process_tile<1, 0, 4, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 3, 1, 0>,
- Conv::template process_tile<1, 0, 4, 3, 1, 1>,
- Conv::template process_tile<1, 0, 4, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 3, 2, 0>,
- Conv::template process_tile<1, 0, 4, 3, 2, 1>,
- Conv::template process_tile<1, 0, 4, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 4, 0, 0>,
- Conv::template process_tile<1, 0, 4, 4, 0, 1>,
- Conv::template process_tile<1, 0, 4, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 4, 1, 0>,
- Conv::template process_tile<1, 0, 4, 4, 1, 1>,
- Conv::template process_tile<1, 0, 4, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 4, 2, 0>,
- Conv::template process_tile<1, 0, 4, 4, 2, 1>,
- Conv::template process_tile<1, 0, 4, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 5, 0, 0>,
- Conv::template process_tile<1, 0, 4, 5, 0, 1>,
- Conv::template process_tile<1, 0, 4, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 5, 1, 0>,
- Conv::template process_tile<1, 0, 4, 5, 1, 1>,
- Conv::template process_tile<1, 0, 4, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 5, 2, 0>,
- Conv::template process_tile<1, 0, 4, 5, 2, 1>,
- Conv::template process_tile<1, 0, 4, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 6, 0, 0>,
- Conv::template process_tile<1, 0, 4, 6, 0, 1>,
- Conv::template process_tile<1, 0, 4, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 6, 1, 0>,
- Conv::template process_tile<1, 0, 4, 6, 1, 1>,
- Conv::template process_tile<1, 0, 4, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 6, 2, 0>,
- Conv::template process_tile<1, 0, 4, 6, 2, 1>,
- Conv::template process_tile<1, 0, 4, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 0, 0, 0>,
- Conv::template process_tile<1, 0, 5, 0, 0, 1>,
- Conv::template process_tile<1, 0, 5, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 0, 1, 0>,
- Conv::template process_tile<1, 0, 5, 0, 1, 1>,
- Conv::template process_tile<1, 0, 5, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 0, 2, 0>,
- Conv::template process_tile<1, 0, 5, 0, 2, 1>,
- Conv::template process_tile<1, 0, 5, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 1, 0, 0>,
- Conv::template process_tile<1, 0, 5, 1, 0, 1>,
- Conv::template process_tile<1, 0, 5, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 1, 1, 0>,
- Conv::template process_tile<1, 0, 5, 1, 1, 1>,
- Conv::template process_tile<1, 0, 5, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 1, 2, 0>,
- Conv::template process_tile<1, 0, 5, 1, 2, 1>,
- Conv::template process_tile<1, 0, 5, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 2, 0, 0>,
- Conv::template process_tile<1, 0, 5, 2, 0, 1>,
- Conv::template process_tile<1, 0, 5, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 2, 1, 0>,
- Conv::template process_tile<1, 0, 5, 2, 1, 1>,
- Conv::template process_tile<1, 0, 5, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 2, 2, 0>,
- Conv::template process_tile<1, 0, 5, 2, 2, 1>,
- Conv::template process_tile<1, 0, 5, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 3, 0, 0>,
- Conv::template process_tile<1, 0, 5, 3, 0, 1>,
- Conv::template process_tile<1, 0, 5, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 3, 1, 0>,
- Conv::template process_tile<1, 0, 5, 3, 1, 1>,
- Conv::template process_tile<1, 0, 5, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 3, 2, 0>,
- Conv::template process_tile<1, 0, 5, 3, 2, 1>,
- Conv::template process_tile<1, 0, 5, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 4, 0, 0>,
- Conv::template process_tile<1, 0, 5, 4, 0, 1>,
- Conv::template process_tile<1, 0, 5, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 4, 1, 0>,
- Conv::template process_tile<1, 0, 5, 4, 1, 1>,
- Conv::template process_tile<1, 0, 5, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 4, 2, 0>,
- Conv::template process_tile<1, 0, 5, 4, 2, 1>,
- Conv::template process_tile<1, 0, 5, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 5, 0, 0>,
- Conv::template process_tile<1, 0, 5, 5, 0, 1>,
- Conv::template process_tile<1, 0, 5, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 5, 1, 0>,
- Conv::template process_tile<1, 0, 5, 5, 1, 1>,
- Conv::template process_tile<1, 0, 5, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 5, 2, 0>,
- Conv::template process_tile<1, 0, 5, 5, 2, 1>,
- Conv::template process_tile<1, 0, 5, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 6, 0, 0>,
- Conv::template process_tile<1, 0, 5, 6, 0, 1>,
- Conv::template process_tile<1, 0, 5, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 6, 1, 0>,
- Conv::template process_tile<1, 0, 5, 6, 1, 1>,
- Conv::template process_tile<1, 0, 5, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 6, 2, 0>,
- Conv::template process_tile<1, 0, 5, 6, 2, 1>,
- Conv::template process_tile<1, 0, 5, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 0, 0, 0>,
- Conv::template process_tile<1, 0, 6, 0, 0, 1>,
- Conv::template process_tile<1, 0, 6, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 0, 1, 0>,
- Conv::template process_tile<1, 0, 6, 0, 1, 1>,
- Conv::template process_tile<1, 0, 6, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 0, 2, 0>,
- Conv::template process_tile<1, 0, 6, 0, 2, 1>,
- Conv::template process_tile<1, 0, 6, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 1, 0, 0>,
- Conv::template process_tile<1, 0, 6, 1, 0, 1>,
- Conv::template process_tile<1, 0, 6, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 1, 1, 0>,
- Conv::template process_tile<1, 0, 6, 1, 1, 1>,
- Conv::template process_tile<1, 0, 6, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 1, 2, 0>,
- Conv::template process_tile<1, 0, 6, 1, 2, 1>,
- Conv::template process_tile<1, 0, 6, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 2, 0, 0>,
- Conv::template process_tile<1, 0, 6, 2, 0, 1>,
- Conv::template process_tile<1, 0, 6, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 2, 1, 0>,
- Conv::template process_tile<1, 0, 6, 2, 1, 1>,
- Conv::template process_tile<1, 0, 6, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 2, 2, 0>,
- Conv::template process_tile<1, 0, 6, 2, 2, 1>,
- Conv::template process_tile<1, 0, 6, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 3, 0, 0>,
- Conv::template process_tile<1, 0, 6, 3, 0, 1>,
- Conv::template process_tile<1, 0, 6, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 3, 1, 0>,
- Conv::template process_tile<1, 0, 6, 3, 1, 1>,
- Conv::template process_tile<1, 0, 6, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 3, 2, 0>,
- Conv::template process_tile<1, 0, 6, 3, 2, 1>,
- Conv::template process_tile<1, 0, 6, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 4, 0, 0>,
- Conv::template process_tile<1, 0, 6, 4, 0, 1>,
- Conv::template process_tile<1, 0, 6, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 4, 1, 0>,
- Conv::template process_tile<1, 0, 6, 4, 1, 1>,
- Conv::template process_tile<1, 0, 6, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 4, 2, 0>,
- Conv::template process_tile<1, 0, 6, 4, 2, 1>,
- Conv::template process_tile<1, 0, 6, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 5, 0, 0>,
- Conv::template process_tile<1, 0, 6, 5, 0, 1>,
- Conv::template process_tile<1, 0, 6, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 5, 1, 0>,
- Conv::template process_tile<1, 0, 6, 5, 1, 1>,
- Conv::template process_tile<1, 0, 6, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 5, 2, 0>,
- Conv::template process_tile<1, 0, 6, 5, 2, 1>,
- Conv::template process_tile<1, 0, 6, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 6, 0, 0>,
- Conv::template process_tile<1, 0, 6, 6, 0, 1>,
- Conv::template process_tile<1, 0, 6, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 6, 1, 0>,
- Conv::template process_tile<1, 0, 6, 6, 1, 1>,
- Conv::template process_tile<1, 0, 6, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 6, 2, 0>,
- Conv::template process_tile<1, 0, 6, 6, 2, 1>,
- Conv::template process_tile<1, 0, 6, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 0, 0, 0>,
- Conv::template process_tile<1, 1, 0, 0, 0, 1>,
- Conv::template process_tile<1, 1, 0, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 0, 1, 0>,
- Conv::template process_tile<1, 1, 0, 0, 1, 1>,
- Conv::template process_tile<1, 1, 0, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 0, 2, 0>,
- Conv::template process_tile<1, 1, 0, 0, 2, 1>,
- Conv::template process_tile<1, 1, 0, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 1, 0, 0>,
- Conv::template process_tile<1, 1, 0, 1, 0, 1>,
- Conv::template process_tile<1, 1, 0, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 1, 1, 0>,
- Conv::template process_tile<1, 1, 0, 1, 1, 1>,
- Conv::template process_tile<1, 1, 0, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 1, 2, 0>,
- Conv::template process_tile<1, 1, 0, 1, 2, 1>,
- Conv::template process_tile<1, 1, 0, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 2, 0, 0>,
- Conv::template process_tile<1, 1, 0, 2, 0, 1>,
- Conv::template process_tile<1, 1, 0, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 2, 1, 0>,
- Conv::template process_tile<1, 1, 0, 2, 1, 1>,
- Conv::template process_tile<1, 1, 0, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 2, 2, 0>,
- Conv::template process_tile<1, 1, 0, 2, 2, 1>,
- Conv::template process_tile<1, 1, 0, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 3, 0, 0>,
- Conv::template process_tile<1, 1, 0, 3, 0, 1>,
- Conv::template process_tile<1, 1, 0, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 3, 1, 0>,
- Conv::template process_tile<1, 1, 0, 3, 1, 1>,
- Conv::template process_tile<1, 1, 0, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 3, 2, 0>,
- Conv::template process_tile<1, 1, 0, 3, 2, 1>,
- Conv::template process_tile<1, 1, 0, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 4, 0, 0>,
- Conv::template process_tile<1, 1, 0, 4, 0, 1>,
- Conv::template process_tile<1, 1, 0, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 4, 1, 0>,
- Conv::template process_tile<1, 1, 0, 4, 1, 1>,
- Conv::template process_tile<1, 1, 0, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 4, 2, 0>,
- Conv::template process_tile<1, 1, 0, 4, 2, 1>,
- Conv::template process_tile<1, 1, 0, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 5, 0, 0>,
- Conv::template process_tile<1, 1, 0, 5, 0, 1>,
- Conv::template process_tile<1, 1, 0, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 5, 1, 0>,
- Conv::template process_tile<1, 1, 0, 5, 1, 1>,
- Conv::template process_tile<1, 1, 0, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 5, 2, 0>,
- Conv::template process_tile<1, 1, 0, 5, 2, 1>,
- Conv::template process_tile<1, 1, 0, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 6, 0, 0>,
- Conv::template process_tile<1, 1, 0, 6, 0, 1>,
- Conv::template process_tile<1, 1, 0, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 6, 1, 0>,
- Conv::template process_tile<1, 1, 0, 6, 1, 1>,
- Conv::template process_tile<1, 1, 0, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 6, 2, 0>,
- Conv::template process_tile<1, 1, 0, 6, 2, 1>,
- Conv::template process_tile<1, 1, 0, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 0, 0, 0>,
- Conv::template process_tile<1, 1, 1, 0, 0, 1>,
- Conv::template process_tile<1, 1, 1, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 0, 1, 0>,
- Conv::template process_tile<1, 1, 1, 0, 1, 1>,
- Conv::template process_tile<1, 1, 1, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 0, 2, 0>,
- Conv::template process_tile<1, 1, 1, 0, 2, 1>,
- Conv::template process_tile<1, 1, 1, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 1, 0, 0>,
- Conv::template process_tile<1, 1, 1, 1, 0, 1>,
- Conv::template process_tile<1, 1, 1, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 1, 1, 0>,
- Conv::template process_tile<1, 1, 1, 1, 1, 1>,
- Conv::template process_tile<1, 1, 1, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 1, 2, 0>,
- Conv::template process_tile<1, 1, 1, 1, 2, 1>,
- Conv::template process_tile<1, 1, 1, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 2, 0, 0>,
- Conv::template process_tile<1, 1, 1, 2, 0, 1>,
- Conv::template process_tile<1, 1, 1, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 2, 1, 0>,
- Conv::template process_tile<1, 1, 1, 2, 1, 1>,
- Conv::template process_tile<1, 1, 1, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 2, 2, 0>,
- Conv::template process_tile<1, 1, 1, 2, 2, 1>,
- Conv::template process_tile<1, 1, 1, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 3, 0, 0>,
- Conv::template process_tile<1, 1, 1, 3, 0, 1>,
- Conv::template process_tile<1, 1, 1, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 3, 1, 0>,
- Conv::template process_tile<1, 1, 1, 3, 1, 1>,
- Conv::template process_tile<1, 1, 1, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 3, 2, 0>,
- Conv::template process_tile<1, 1, 1, 3, 2, 1>,
- Conv::template process_tile<1, 1, 1, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 4, 0, 0>,
- Conv::template process_tile<1, 1, 1, 4, 0, 1>,
- Conv::template process_tile<1, 1, 1, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 4, 1, 0>,
- Conv::template process_tile<1, 1, 1, 4, 1, 1>,
- Conv::template process_tile<1, 1, 1, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 4, 2, 0>,
- Conv::template process_tile<1, 1, 1, 4, 2, 1>,
- Conv::template process_tile<1, 1, 1, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 5, 0, 0>,
- Conv::template process_tile<1, 1, 1, 5, 0, 1>,
- Conv::template process_tile<1, 1, 1, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 5, 1, 0>,
- Conv::template process_tile<1, 1, 1, 5, 1, 1>,
- Conv::template process_tile<1, 1, 1, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 5, 2, 0>,
- Conv::template process_tile<1, 1, 1, 5, 2, 1>,
- Conv::template process_tile<1, 1, 1, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 6, 0, 0>,
- Conv::template process_tile<1, 1, 1, 6, 0, 1>,
- Conv::template process_tile<1, 1, 1, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 6, 1, 0>,
- Conv::template process_tile<1, 1, 1, 6, 1, 1>,
- Conv::template process_tile<1, 1, 1, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 6, 2, 0>,
- Conv::template process_tile<1, 1, 1, 6, 2, 1>,
- Conv::template process_tile<1, 1, 1, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 0, 0, 0>,
- Conv::template process_tile<1, 1, 2, 0, 0, 1>,
- Conv::template process_tile<1, 1, 2, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 0, 1, 0>,
- Conv::template process_tile<1, 1, 2, 0, 1, 1>,
- Conv::template process_tile<1, 1, 2, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 0, 2, 0>,
- Conv::template process_tile<1, 1, 2, 0, 2, 1>,
- Conv::template process_tile<1, 1, 2, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 1, 0, 0>,
- Conv::template process_tile<1, 1, 2, 1, 0, 1>,
- Conv::template process_tile<1, 1, 2, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 1, 1, 0>,
- Conv::template process_tile<1, 1, 2, 1, 1, 1>,
- Conv::template process_tile<1, 1, 2, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 1, 2, 0>,
- Conv::template process_tile<1, 1, 2, 1, 2, 1>,
- Conv::template process_tile<1, 1, 2, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 2, 0, 0>,
- Conv::template process_tile<1, 1, 2, 2, 0, 1>,
- Conv::template process_tile<1, 1, 2, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 2, 1, 0>,
- Conv::template process_tile<1, 1, 2, 2, 1, 1>,
- Conv::template process_tile<1, 1, 2, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 2, 2, 0>,
- Conv::template process_tile<1, 1, 2, 2, 2, 1>,
- Conv::template process_tile<1, 1, 2, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 3, 0, 0>,
- Conv::template process_tile<1, 1, 2, 3, 0, 1>,
- Conv::template process_tile<1, 1, 2, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 3, 1, 0>,
- Conv::template process_tile<1, 1, 2, 3, 1, 1>,
- Conv::template process_tile<1, 1, 2, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 3, 2, 0>,
- Conv::template process_tile<1, 1, 2, 3, 2, 1>,
- Conv::template process_tile<1, 1, 2, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 4, 0, 0>,
- Conv::template process_tile<1, 1, 2, 4, 0, 1>,
- Conv::template process_tile<1, 1, 2, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 4, 1, 0>,
- Conv::template process_tile<1, 1, 2, 4, 1, 1>,
- Conv::template process_tile<1, 1, 2, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 4, 2, 0>,
- Conv::template process_tile<1, 1, 2, 4, 2, 1>,
- Conv::template process_tile<1, 1, 2, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 5, 0, 0>,
- Conv::template process_tile<1, 1, 2, 5, 0, 1>,
- Conv::template process_tile<1, 1, 2, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 5, 1, 0>,
- Conv::template process_tile<1, 1, 2, 5, 1, 1>,
- Conv::template process_tile<1, 1, 2, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 5, 2, 0>,
- Conv::template process_tile<1, 1, 2, 5, 2, 1>,
- Conv::template process_tile<1, 1, 2, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 6, 0, 0>,
- Conv::template process_tile<1, 1, 2, 6, 0, 1>,
- Conv::template process_tile<1, 1, 2, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 6, 1, 0>,
- Conv::template process_tile<1, 1, 2, 6, 1, 1>,
- Conv::template process_tile<1, 1, 2, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 6, 2, 0>,
- Conv::template process_tile<1, 1, 2, 6, 2, 1>,
- Conv::template process_tile<1, 1, 2, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 0, 0, 0>,
- Conv::template process_tile<1, 1, 3, 0, 0, 1>,
- Conv::template process_tile<1, 1, 3, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 0, 1, 0>,
- Conv::template process_tile<1, 1, 3, 0, 1, 1>,
- Conv::template process_tile<1, 1, 3, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 0, 2, 0>,
- Conv::template process_tile<1, 1, 3, 0, 2, 1>,
- Conv::template process_tile<1, 1, 3, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 1, 0, 0>,
- Conv::template process_tile<1, 1, 3, 1, 0, 1>,
- Conv::template process_tile<1, 1, 3, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 1, 1, 0>,
- Conv::template process_tile<1, 1, 3, 1, 1, 1>,
- Conv::template process_tile<1, 1, 3, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 1, 2, 0>,
- Conv::template process_tile<1, 1, 3, 1, 2, 1>,
- Conv::template process_tile<1, 1, 3, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 2, 0, 0>,
- Conv::template process_tile<1, 1, 3, 2, 0, 1>,
- Conv::template process_tile<1, 1, 3, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 2, 1, 0>,
- Conv::template process_tile<1, 1, 3, 2, 1, 1>,
- Conv::template process_tile<1, 1, 3, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 2, 2, 0>,
- Conv::template process_tile<1, 1, 3, 2, 2, 1>,
- Conv::template process_tile<1, 1, 3, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 3, 0, 0>,
- Conv::template process_tile<1, 1, 3, 3, 0, 1>,
- Conv::template process_tile<1, 1, 3, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 3, 1, 0>,
- Conv::template process_tile<1, 1, 3, 3, 1, 1>,
- Conv::template process_tile<1, 1, 3, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 3, 2, 0>,
- Conv::template process_tile<1, 1, 3, 3, 2, 1>,
- Conv::template process_tile<1, 1, 3, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 4, 0, 0>,
- Conv::template process_tile<1, 1, 3, 4, 0, 1>,
- Conv::template process_tile<1, 1, 3, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 4, 1, 0>,
- Conv::template process_tile<1, 1, 3, 4, 1, 1>,
- Conv::template process_tile<1, 1, 3, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 4, 2, 0>,
- Conv::template process_tile<1, 1, 3, 4, 2, 1>,
- Conv::template process_tile<1, 1, 3, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 5, 0, 0>,
- Conv::template process_tile<1, 1, 3, 5, 0, 1>,
- Conv::template process_tile<1, 1, 3, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 5, 1, 0>,
- Conv::template process_tile<1, 1, 3, 5, 1, 1>,
- Conv::template process_tile<1, 1, 3, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 5, 2, 0>,
- Conv::template process_tile<1, 1, 3, 5, 2, 1>,
- Conv::template process_tile<1, 1, 3, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 6, 0, 0>,
- Conv::template process_tile<1, 1, 3, 6, 0, 1>,
- Conv::template process_tile<1, 1, 3, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 6, 1, 0>,
- Conv::template process_tile<1, 1, 3, 6, 1, 1>,
- Conv::template process_tile<1, 1, 3, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 6, 2, 0>,
- Conv::template process_tile<1, 1, 3, 6, 2, 1>,
- Conv::template process_tile<1, 1, 3, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 0, 0, 0>,
- Conv::template process_tile<1, 1, 4, 0, 0, 1>,
- Conv::template process_tile<1, 1, 4, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 0, 1, 0>,
- Conv::template process_tile<1, 1, 4, 0, 1, 1>,
- Conv::template process_tile<1, 1, 4, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 0, 2, 0>,
- Conv::template process_tile<1, 1, 4, 0, 2, 1>,
- Conv::template process_tile<1, 1, 4, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 1, 0, 0>,
- Conv::template process_tile<1, 1, 4, 1, 0, 1>,
- Conv::template process_tile<1, 1, 4, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 1, 1, 0>,
- Conv::template process_tile<1, 1, 4, 1, 1, 1>,
- Conv::template process_tile<1, 1, 4, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 1, 2, 0>,
- Conv::template process_tile<1, 1, 4, 1, 2, 1>,
- Conv::template process_tile<1, 1, 4, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 2, 0, 0>,
- Conv::template process_tile<1, 1, 4, 2, 0, 1>,
- Conv::template process_tile<1, 1, 4, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 2, 1, 0>,
- Conv::template process_tile<1, 1, 4, 2, 1, 1>,
- Conv::template process_tile<1, 1, 4, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 2, 2, 0>,
- Conv::template process_tile<1, 1, 4, 2, 2, 1>,
- Conv::template process_tile<1, 1, 4, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 3, 0, 0>,
- Conv::template process_tile<1, 1, 4, 3, 0, 1>,
- Conv::template process_tile<1, 1, 4, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 3, 1, 0>,
- Conv::template process_tile<1, 1, 4, 3, 1, 1>,
- Conv::template process_tile<1, 1, 4, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 3, 2, 0>,
- Conv::template process_tile<1, 1, 4, 3, 2, 1>,
- Conv::template process_tile<1, 1, 4, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 4, 0, 0>,
- Conv::template process_tile<1, 1, 4, 4, 0, 1>,
- Conv::template process_tile<1, 1, 4, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 4, 1, 0>,
- Conv::template process_tile<1, 1, 4, 4, 1, 1>,
- Conv::template process_tile<1, 1, 4, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 4, 2, 0>,
- Conv::template process_tile<1, 1, 4, 4, 2, 1>,
- Conv::template process_tile<1, 1, 4, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 5, 0, 0>,
- Conv::template process_tile<1, 1, 4, 5, 0, 1>,
- Conv::template process_tile<1, 1, 4, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 5, 1, 0>,
- Conv::template process_tile<1, 1, 4, 5, 1, 1>,
- Conv::template process_tile<1, 1, 4, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 5, 2, 0>,
- Conv::template process_tile<1, 1, 4, 5, 2, 1>,
- Conv::template process_tile<1, 1, 4, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 6, 0, 0>,
- Conv::template process_tile<1, 1, 4, 6, 0, 1>,
- Conv::template process_tile<1, 1, 4, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 6, 1, 0>,
- Conv::template process_tile<1, 1, 4, 6, 1, 1>,
- Conv::template process_tile<1, 1, 4, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 6, 2, 0>,
- Conv::template process_tile<1, 1, 4, 6, 2, 1>,
- Conv::template process_tile<1, 1, 4, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 0, 0, 0>,
- Conv::template process_tile<1, 1, 5, 0, 0, 1>,
- Conv::template process_tile<1, 1, 5, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 0, 1, 0>,
- Conv::template process_tile<1, 1, 5, 0, 1, 1>,
- Conv::template process_tile<1, 1, 5, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 0, 2, 0>,
- Conv::template process_tile<1, 1, 5, 0, 2, 1>,
- Conv::template process_tile<1, 1, 5, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 1, 0, 0>,
- Conv::template process_tile<1, 1, 5, 1, 0, 1>,
- Conv::template process_tile<1, 1, 5, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 1, 1, 0>,
- Conv::template process_tile<1, 1, 5, 1, 1, 1>,
- Conv::template process_tile<1, 1, 5, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 1, 2, 0>,
- Conv::template process_tile<1, 1, 5, 1, 2, 1>,
- Conv::template process_tile<1, 1, 5, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 2, 0, 0>,
- Conv::template process_tile<1, 1, 5, 2, 0, 1>,
- Conv::template process_tile<1, 1, 5, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 2, 1, 0>,
- Conv::template process_tile<1, 1, 5, 2, 1, 1>,
- Conv::template process_tile<1, 1, 5, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 2, 2, 0>,
- Conv::template process_tile<1, 1, 5, 2, 2, 1>,
- Conv::template process_tile<1, 1, 5, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 3, 0, 0>,
- Conv::template process_tile<1, 1, 5, 3, 0, 1>,
- Conv::template process_tile<1, 1, 5, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 3, 1, 0>,
- Conv::template process_tile<1, 1, 5, 3, 1, 1>,
- Conv::template process_tile<1, 1, 5, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 3, 2, 0>,
- Conv::template process_tile<1, 1, 5, 3, 2, 1>,
- Conv::template process_tile<1, 1, 5, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 4, 0, 0>,
- Conv::template process_tile<1, 1, 5, 4, 0, 1>,
- Conv::template process_tile<1, 1, 5, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 4, 1, 0>,
- Conv::template process_tile<1, 1, 5, 4, 1, 1>,
- Conv::template process_tile<1, 1, 5, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 4, 2, 0>,
- Conv::template process_tile<1, 1, 5, 4, 2, 1>,
- Conv::template process_tile<1, 1, 5, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 5, 0, 0>,
- Conv::template process_tile<1, 1, 5, 5, 0, 1>,
- Conv::template process_tile<1, 1, 5, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 5, 1, 0>,
- Conv::template process_tile<1, 1, 5, 5, 1, 1>,
- Conv::template process_tile<1, 1, 5, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 5, 2, 0>,
- Conv::template process_tile<1, 1, 5, 5, 2, 1>,
- Conv::template process_tile<1, 1, 5, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 6, 0, 0>,
- Conv::template process_tile<1, 1, 5, 6, 0, 1>,
- Conv::template process_tile<1, 1, 5, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 6, 1, 0>,
- Conv::template process_tile<1, 1, 5, 6, 1, 1>,
- Conv::template process_tile<1, 1, 5, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 6, 2, 0>,
- Conv::template process_tile<1, 1, 5, 6, 2, 1>,
- Conv::template process_tile<1, 1, 5, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 0, 0, 0>,
- Conv::template process_tile<1, 1, 6, 0, 0, 1>,
- Conv::template process_tile<1, 1, 6, 0, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 0, 1, 0>,
- Conv::template process_tile<1, 1, 6, 0, 1, 1>,
- Conv::template process_tile<1, 1, 6, 0, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 0, 2, 0>,
- Conv::template process_tile<1, 1, 6, 0, 2, 1>,
- Conv::template process_tile<1, 1, 6, 0, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 1, 0, 0>,
- Conv::template process_tile<1, 1, 6, 1, 0, 1>,
- Conv::template process_tile<1, 1, 6, 1, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 1, 1, 0>,
- Conv::template process_tile<1, 1, 6, 1, 1, 1>,
- Conv::template process_tile<1, 1, 6, 1, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 1, 2, 0>,
- Conv::template process_tile<1, 1, 6, 1, 2, 1>,
- Conv::template process_tile<1, 1, 6, 1, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 2, 0, 0>,
- Conv::template process_tile<1, 1, 6, 2, 0, 1>,
- Conv::template process_tile<1, 1, 6, 2, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 2, 1, 0>,
- Conv::template process_tile<1, 1, 6, 2, 1, 1>,
- Conv::template process_tile<1, 1, 6, 2, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 2, 2, 0>,
- Conv::template process_tile<1, 1, 6, 2, 2, 1>,
- Conv::template process_tile<1, 1, 6, 2, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 3, 0, 0>,
- Conv::template process_tile<1, 1, 6, 3, 0, 1>,
- Conv::template process_tile<1, 1, 6, 3, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 3, 1, 0>,
- Conv::template process_tile<1, 1, 6, 3, 1, 1>,
- Conv::template process_tile<1, 1, 6, 3, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 3, 2, 0>,
- Conv::template process_tile<1, 1, 6, 3, 2, 1>,
- Conv::template process_tile<1, 1, 6, 3, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 4, 0, 0>,
- Conv::template process_tile<1, 1, 6, 4, 0, 1>,
- Conv::template process_tile<1, 1, 6, 4, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 4, 1, 0>,
- Conv::template process_tile<1, 1, 6, 4, 1, 1>,
- Conv::template process_tile<1, 1, 6, 4, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 4, 2, 0>,
- Conv::template process_tile<1, 1, 6, 4, 2, 1>,
- Conv::template process_tile<1, 1, 6, 4, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 5, 0, 0>,
- Conv::template process_tile<1, 1, 6, 5, 0, 1>,
- Conv::template process_tile<1, 1, 6, 5, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 5, 1, 0>,
- Conv::template process_tile<1, 1, 6, 5, 1, 1>,
- Conv::template process_tile<1, 1, 6, 5, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 5, 2, 0>,
- Conv::template process_tile<1, 1, 6, 5, 2, 1>,
- Conv::template process_tile<1, 1, 6, 5, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 6, 0, 0>,
- Conv::template process_tile<1, 1, 6, 6, 0, 1>,
- Conv::template process_tile<1, 1, 6, 6, 0, 2>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 6, 1, 0>,
- Conv::template process_tile<1, 1, 6, 6, 1, 1>,
- Conv::template process_tile<1, 1, 6, 6, 1, 2>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 6, 2, 0>,
- Conv::template process_tile<1, 1, 6, 6, 2, 1>,
- Conv::template process_tile<1, 1, 6, 6, 2, 2>,
- }, // Output pad bottom = 2
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 1
- }, // Input pad top = 1
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+ const int n_channels,
+ const float* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
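+  // The six trailing padding arguments are deliberately unnamed: this
+  // specialisation handles only the fully unpadded tile, so they are unused.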
+ const int, const int, const int, const int, const int, const int
+)
+{
+  // Copy the argument pointers into locals; the inline assembly below
+  // advances them in place using post-indexed loads and stores.
+ const float *uptr0 = inptr;
+ const float *wptr0 = weights;
+ float *vptr0 = outptr;
+
+ int channels_remaining = n_channels;
+ if (channels_remaining >= 4)
+ {
+ // Process blocks of 4 channels at a time
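+    // The kernel is software-pipelined: the first block's loads are issued in
+    // the prologue below and the final block completes in the tail (label 2),
+    // so the loop body (label 1) runs one iteration fewer than there are blocks.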
+ int n_iters = channels_remaining / 4 - 1;
+ channels_remaining %= 4;
+
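+    // Naming scheme for the .req aliases below: vUrc holds four channels of
+    // the input at row r, column c of the 7x7 input window; vWrc one of the
+    // 3x3 weights; vVrc one of the 3x3 outputs. Several names share a physical
+    // register, since not all values are live at the same time.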
+ asm volatile(
+ // Prepare aliases
+ "qW13 .req q0\n" "vW13 .req v0\n"
+ "qU15 .req q1\n" "qU73 .req q1\n" "qU45 .req q1\n" "qU14 .req q1\n"
+ "vU15 .req v1\n" "vU73 .req v1\n" "vU45 .req v1\n" "vU14 .req v1\n"
+ "qU62 .req q2\n" "qV12 .req q2\n" "vU62 .req v2\n" "vV12 .req v2\n"
+ "qU51 .req q3\n" "qU43 .req q3\n" "qU55 .req q3\n"
+ "vU51 .req v3\n" "vU43 .req v3\n" "vU55 .req v3\n"
+ "qU77 .req q4\n" "qV13 .req q4\n" "qV31 .req q4\n" "qU44 .req q4\n"
+ "vU77 .req v4\n" "vV13 .req v4\n" "vV31 .req v4\n" "vU44 .req v4\n"
+ "qV33 .req q5\n" "qU46 .req q5\n" "qU11 .req q5\n" "qU37 .req q5\n"
+ "vV33 .req v5\n" "vU46 .req v5\n" "vU11 .req v5\n" "vU37 .req v5\n"
+ "qU56 .req q6\n" "qU25 .req q6\n" "qU32 .req q6\n"
+ "vU56 .req v6\n" "vU25 .req v6\n" "vU32 .req v6\n"
+ "qU72 .req q7\n" "qV22 .req q7\n" "vU72 .req v7\n" "vV22 .req v7\n"
+ "qU67 .req q8\n" "qU61 .req q8\n" "qU13 .req q8\n"
+ "vU67 .req v8\n" "vU61 .req v8\n" "vU13 .req v8\n"
+ "qU74 .req q9\n" "qU34 .req q9\n" "qU17 .req q9\n" "qU66 .req q9\n"
+ "vU74 .req v9\n" "vU34 .req v9\n" "vU17 .req v9\n" "vU66 .req v9\n"
+ "qU33 .req q10\n" "qU57 .req q10\n" "qU21 .req q10\n"
+ "vU33 .req v10\n" "vU57 .req v10\n" "vU21 .req v10\n" "qW23 .req q11\n"
+ "vW23 .req v11\n" "qU42 .req q12\n" "qV23 .req q12\n" "qU23 .req q12\n"
+ "vU42 .req v12\n" "vV23 .req v12\n" "vU23 .req v12\n"
+ "qW33 .req q13\n" "vW33 .req v13\n"
+ "qU76 .req q14\n" "qU47 .req q14\n" "qU64 .req q14\n" "qU41 .req q14\n"
+ "vU76 .req v14\n" "vU47 .req v14\n" "vU64 .req v14\n" "vU41 .req v14\n"
+ "qU52 .req q15\n" "qU54 .req q15\n" "qU75 .req q15\n" "qU26 .req q15\n"
+ "vU52 .req v15\n" "vU54 .req v15\n" "vU75 .req v15\n" "vU26 .req v15\n"
+ "qU53 .req q16\n" "qU27 .req q16\n" "vU53 .req v16\n" "vU27 .req v16\n"
+ "qV21 .req q17\n" "qU65 .req q17\n" "vV21 .req v17\n" "vU65 .req v17\n"
+ "qU31 .req q18\n" "qU24 .req q18\n" "qU36 .req q18\n"
+ "vU31 .req v18\n" "vU24 .req v18\n" "vU36 .req v18\n" "qU22 .req q19\n"
+ "vU22 .req v19\n" "qU35 .req q20\n" "qU63 .req q20\n"
+ "vU35 .req v20\n" "vU63 .req v20\n" "qW12 .req q21\n"
+ "vW12 .req v21\n" "qV32 .req q22\n" "qU16 .req q22\n"
+ "vV32 .req v22\n" "vU16 .req v22\n" "qW11 .req q23\n" "vW11 .req v23\n"
+ "qU12 .req q24\n" "vU12 .req v24\n" "qW31 .req q25\n" "vW31 .req v25\n"
+ "qW22 .req q26\n" "vW22 .req v26\n" "qU71 .req q27\n" "vU71 .req v27\n"
+ "qV11 .req q28\n" "vV11 .req v28\n" "qW21 .req q29\n" "vW21 .req v29\n"
+ "qW32 .req q30\n" "vW32 .req v30\n"
+
+ "uptr1 .req x0\n"
+ "uptr2 .req x1\n"
+ "uptr3 .req x2\n"
+ "uptr4 .req x3\n"
+ "uptr5 .req x4\n"
+ "uptr6 .req x5\n"
+ "u_col_stride1 .req %x[u_col_stride]\n"
+ "u_col_stride2 .req x6\n"
+ "u_col_stride3 .req x7\n"
+ "u_col_stride4 .req x8\n"
+ "u_col_stride5 .req x9\n"
+ "u_col_stride6 .req x10\n"
+ "wptr1 .req x11\n"
+ "wptr2 .req x12\n"
+ "w_col_stride1 .req %x[w_col_stride]\n"
+ "w_col_stride2 .req x13\n"
+ "vptr1 .req x14\n"
+ "vptr2 .req x15\n"
+ "v_col_stride1 .req %x[v_col_stride]\n"
+ "v_col_stride2 .req x16\n"
+
+ // Prepare strides and pointers
+ "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+ "add uptr2, uptr1 , %x[u_row_stride]\n"
+ "add uptr3, uptr2 , %x[u_row_stride]\n"
+ "add uptr4, uptr3 , %x[u_row_stride]\n"
+ "add uptr5, uptr4 , %x[u_row_stride]\n"
+ "add uptr6, uptr5 , %x[u_row_stride]\n"
+ "add u_col_stride2, u_col_stride1, u_col_stride1\n"
+ "add u_col_stride3, u_col_stride2, u_col_stride1\n"
+ "add u_col_stride4, u_col_stride3, u_col_stride1\n"
+ "add u_col_stride5, u_col_stride4, u_col_stride1\n"
+ "add u_col_stride6, u_col_stride5, u_col_stride1\n"
+
+ "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+ "add wptr2, wptr1 , %x[w_row_stride]\n"
+ "add w_col_stride2, w_col_stride1, w_col_stride1\n"
+
+ "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+ "add vptr2, vptr1 , %x[v_row_stride]\n"
+ "add v_col_stride2, v_col_stride1, v_col_stride1\n"
+
+ // Prepare for first iteration
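+      // (all nine weight vectors and the first input loads are issued up
+      // front so the loop body can overlap its loads with arithmetic)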
+ "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+ "ldr qW23, [wptr1, w_col_stride2]\n"
+ "ldr qW33, [wptr2, w_col_stride2]\n"
+ "ldr qW12, [%x[wptr0], w_col_stride1]\n"
+ "ldr qW22, [wptr1, w_col_stride1]\n"
+ "ldr qW32, [wptr2, w_col_stride1]\n"
+ "ldr qW11, [%x[wptr0]], #0x10\n"
+ "ldr qW21, [wptr1], #0x10\n"
+ "ldr qU17, [%x[uptr0], u_col_stride6]\n"
+ "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+ "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+ "ldr qU37, [uptr2, u_col_stride6]\n"
+ "ldr qU35, [uptr2, u_col_stride4]\n"
+ "ldr qU36, [uptr2, u_col_stride5]\n"
+ "ldr qU27, [uptr1, u_col_stride6]\n"
+ "ldr qU25, [uptr1, u_col_stride4]\n"
+ "fmul vV13.4s, vU17.4s, vW13.4s\n"
+ "fmul vV12.4s, vU15.4s, vW13.4s\n"
+ "fmla vV13.4s, vU15.4s, vW11.4s\n"
+ "ldr qW31, [wptr2], #0x10\n"
+ "fmla vV13.4s, vU16.4s, vW12.4s\n"
+ "ldr qU26, [uptr1, u_col_stride5]\n"
+ "fmla vV13.4s, vU37.4s, vW33.4s\n"
+ "ldr qU47, [uptr3, u_col_stride6]\n"
+ "fmul vV23.4s, vU37.4s, vW13.4s\n"
+ "ldr qU45, [uptr3, u_col_stride4]\n"
+ "fmla vV12.4s, vU35.4s, vW33.4s\n"
+ "ldr qU46, [uptr3, u_col_stride5]\n"
+ "fmla vV13.4s, vU35.4s, vW31.4s\n"
+ "ldr qU67, [uptr5, u_col_stride6]\n"
+ "fmul vV22.4s, vU35.4s, vW13.4s\n"
+ "cbz %x[n_iters], 2f\n" // Jump to tail if no iterations
+
+ "1:" // Loop body
+ "fmla vV23.4s, vU35.4s, vW11.4s\n"
+ "ldr qU65, [uptr5, u_col_stride4]\n"
+ "fmla vV13.4s, vU36.4s, vW32.4s\n"
+ "fmla vV23.4s, vU36.4s, vW12.4s\n"
+ "ldr qU66, [uptr5, u_col_stride5]\n"
+ "fmla vV13.4s, vU27.4s, vW23.4s\n"
+ "ldr qU57, [uptr4, u_col_stride6]\n"
+ "fmla vV12.4s, vU25.4s, vW23.4s\n"
+ "ldr qU55, [uptr4, u_col_stride4]\n"
+ "fmla vV13.4s, vU25.4s, vW21.4s\n"
+ "ldr qU56, [uptr4, u_col_stride5]\n"
+ "fmla vV13.4s, vU26.4s, vW22.4s\n"
+ "str qV13, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV23.4s, vU47.4s, vW23.4s\n"
+ "ldr qU77, [uptr6, u_col_stride6]\n"
+ "fmla vV22.4s, vU45.4s, vW23.4s\n"
+ "fmla vV23.4s, vU45.4s, vW21.4s\n"
+ "ldr qU75, [uptr6, u_col_stride4]\n"
+ "fmla vV23.4s, vU46.4s, vW22.4s\n"
+ "ldr qU76, [uptr6, u_col_stride5]\n"
+ "fmul vV33.4s, vU67.4s, vW23.4s\n"
+ "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+ "fmul vV32.4s, vU65.4s, vW23.4s\n"
+ "fmla vV33.4s, vU65.4s, vW21.4s\n"
+ "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV33.4s, vU66.4s, vW22.4s\n"
+ "ldr qU34, [uptr2, u_col_stride3]\n"
+ "fmla vV23.4s, vU57.4s, vW33.4s\n"
+ "fmla vV33.4s, vU57.4s, vW13.4s\n"
+ "ldr qU33, [uptr2, u_col_stride2]\n"
+ "fmla vV22.4s, vU55.4s, vW33.4s\n"
+ "fmla vV23.4s, vU55.4s, vW31.4s\n"
+ "fmla vV32.4s, vU55.4s, vW13.4s\n"
+ "fmla vV33.4s, vU55.4s, vW11.4s\n"
+ "ldr qU24, [uptr1, u_col_stride3]\n"
+ "fmla vV23.4s, vU56.4s, vW32.4s\n"
+ "str qV23, [vptr1, v_col_stride2]\n"
+ "fmla vV33.4s, vU56.4s, vW12.4s\n"
+ "ldr qU23, [uptr1, u_col_stride2]\n"
+ "fmla vV33.4s, vU77.4s, vW33.4s\n"
+ "ldr qU44, [uptr3, u_col_stride3]\n"
+ "fmla vV32.4s, vU75.4s, vW33.4s\n"
+ "fmla vV33.4s, vU75.4s, vW31.4s\n"
+ "ldr qU43, [uptr3, u_col_stride2]\n"
+ "fmla vV33.4s, vU76.4s, vW32.4s\n"
+ "str qV33, [vptr2, v_col_stride2]\n"
+ "ldr qU64, [uptr5, u_col_stride3]\n"
+ "fmla vV12.4s, vU14.4s, vW12.4s\n"
+ "ldr qU63, [uptr5, u_col_stride2]\n"
+ "fmul vV11.4s, vU13.4s, vW13.4s\n"
+ "fmla vV12.4s, vU13.4s, vW11.4s\n"
+ "ldr qU54, [uptr4, u_col_stride3]\n"
+ "fmla vV12.4s, vU34.4s, vW32.4s\n"
+ "fmla vV22.4s, vU34.4s, vW12.4s\n"
+ "ldr qU53, [uptr4, u_col_stride2]\n"
+ "fmla vV11.4s, vU33.4s, vW33.4s\n"
+ "ldr qU74, [uptr6, u_col_stride3]\n"
+ "fmla vV12.4s, vU33.4s, vW31.4s\n"
+ "ldr qU73, [uptr6, u_col_stride2]\n"
+ "fmul vV21.4s, vU33.4s, vW13.4s\n"
+ "ldr qU12, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV22.4s, vU33.4s, vW11.4s\n"
+ "ldr qU11, [%x[uptr0]], #0x10\n"
+ "fmla vV12.4s, vU24.4s, vW22.4s\n"
+ "ldr qU32, [uptr2, u_col_stride1]\n"
+ "fmla vV11.4s, vU23.4s, vW23.4s\n"
+ "ldr qU31, [uptr2], #0x10\n"
+ "fmla vV12.4s, vU23.4s, vW21.4s\n"
+ "str qV12, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV22.4s, vU44.4s, vW22.4s\n"
+ "ldr qU22, [uptr1, u_col_stride1]\n"
+ "fmla vV21.4s, vU43.4s, vW23.4s\n"
+ "ldr qU21, [uptr1], #0x10\n"
+ "fmla vV22.4s, vU43.4s, vW21.4s\n"
+ "ldr qU42, [uptr3, u_col_stride1]\n"
+ "fmla vV32.4s, vU64.4s, vW22.4s\n"
+ "ldr qU41, [uptr3], #0x10\n"
+ "fmul vV31.4s, vU63.4s, vW23.4s\n"
+ "ldr qW23, [wptr1, w_col_stride2]\n"
+ "fmla vV32.4s, vU63.4s, vW21.4s\n"
+ "ldr qU62, [uptr5, u_col_stride1]\n"
+ "fmla vV22.4s, vU54.4s, vW32.4s\n"
+ "ldr qU61, [uptr5], #0x10\n"
+ "fmla vV32.4s, vU54.4s, vW12.4s\n"
+ "ldr qU52, [uptr4, u_col_stride1]\n"
+ "fmla vV21.4s, vU53.4s, vW33.4s\n"
+ "ldr qU51, [uptr4], #0x10\n"
+ "fmla vV22.4s, vU53.4s, vW31.4s\n"
+ "str qV22, [vptr1, v_col_stride1]\n"
+ "fmla vV31.4s, vU53.4s, vW13.4s\n"
+ "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV32.4s, vU53.4s, vW11.4s\n"
+ "ldr qU72, [uptr6, u_col_stride1]\n"
+ "fmla vV32.4s, vU74.4s, vW32.4s\n"
+ "ldr qU71, [uptr6], #0x10\n"
+ "fmla vV31.4s, vU73.4s, vW33.4s\n"
+ "ldr qW33, [wptr2, w_col_stride2]\n"
+ "fmla vV32.4s, vU73.4s, vW31.4s\n"
+ "str qV32, [vptr2, v_col_stride1]\n"
+ "fmla vV11.4s, vU12.4s, vW12.4s\n"
+ "ldr qU17, [%x[uptr0], u_col_stride6]\n"
+ "fmla vV11.4s, vU11.4s, vW11.4s\n"
+ "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+ "fmla vV11.4s, vU32.4s, vW32.4s\n"
+ "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+ "fmla vV21.4s, vU32.4s, vW12.4s\n"
+ "ldr qU37, [uptr2, u_col_stride6]\n"
+ "fmla vV11.4s, vU31.4s, vW31.4s\n"
+ "ldr qU35, [uptr2, u_col_stride4]\n"
+ "fmla vV21.4s, vU31.4s, vW11.4s\n"
+ "ldr qU36, [uptr2, u_col_stride5]\n"
+ "fmla vV11.4s, vU22.4s, vW22.4s\n"
+ "ldr qU27, [uptr1, u_col_stride6]\n"
+ "fmla vV11.4s, vU21.4s, vW21.4s\n"
+ "str qV11, [%x[vptr0]], #0x10\n"
+ "fmla vV21.4s, vU42.4s, vW22.4s\n"
+ "ldr qU25, [uptr1, u_col_stride4]\n"
+ "fmla vV21.4s, vU41.4s, vW21.4s\n"
+ "fmla vV31.4s, vU62.4s, vW22.4s\n"
+ "ldr qW22, [wptr1, w_col_stride1]\n"
+ "fmla vV31.4s, vU61.4s, vW21.4s\n"
+ "ldr qW21, [wptr1], #0x10\n"
+ "fmla vV21.4s, vU52.4s, vW32.4s\n"
+ "fmla vV31.4s, vU52.4s, vW12.4s\n"
+ "ldr qW12, [%x[wptr0], w_col_stride1]\n"
+ "fmla vV21.4s, vU51.4s, vW31.4s\n"
+ "str qV21, [vptr1], #0x10\n"
+ "fmla vV31.4s, vU51.4s, vW11.4s\n"
+ "ldr qW11, [%x[wptr0]], #0x10\n"
+ "fmla vV31.4s, vU72.4s, vW32.4s\n"
+ "ldr qW32, [wptr2, w_col_stride1]\n"
+ "fmla vV31.4s, vU71.4s, vW31.4s\n"
+ "str qV31, [vptr2], #0x10\n"
+ "fmul vV13.4s, vU17.4s, vW13.4s\n"
+ "fmul vV12.4s, vU15.4s, vW13.4s\n"
+ "subs %x[n_iters], %x[n_iters], #1\n"
+ "fmla vV13.4s, vU15.4s, vW11.4s\n"
+ "ldr qW31, [wptr2], #0x10\n"
+ "fmla vV13.4s, vU16.4s, vW12.4s\n"
+ "ldr qU26, [uptr1, u_col_stride5]\n"
+ "fmla vV13.4s, vU37.4s, vW33.4s\n"
+ "ldr qU47, [uptr3, u_col_stride6]\n"
+ "fmul vV23.4s, vU37.4s, vW13.4s\n"
+ "ldr qU45, [uptr3, u_col_stride4]\n"
+ "fmla vV12.4s, vU35.4s, vW33.4s\n"
+ "ldr qU46, [uptr3, u_col_stride5]\n"
+ "fmla vV13.4s, vU35.4s, vW31.4s\n"
+ "ldr qU67, [uptr5, u_col_stride6]\n"
+ "fmul vV22.4s, vU35.4s, vW13.4s\n"
+ "bne 1b\n"
+
+ "2:" // Tail iteration
+ "fmla vV23.4s, vU35.4s, vW11.4s\n"
+ "ldr qU65, [uptr5, u_col_stride4]\n"
+ "fmla vV13.4s, vU36.4s, vW32.4s\n"
+ "fmla vV23.4s, vU36.4s, vW12.4s\n"
+ "ldr qU66, [uptr5, u_col_stride5]\n"
+ "fmla vV13.4s, vU27.4s, vW23.4s\n"
+ "ldr qU57, [uptr4, u_col_stride6]\n"
+ "fmla vV12.4s, vU25.4s, vW23.4s\n"
+ "ldr qU55, [uptr4, u_col_stride4]\n"
+ "fmla vV13.4s, vU25.4s, vW21.4s\n"
+ "ldr qU56, [uptr4, u_col_stride5]\n"
+ "fmla vV13.4s, vU26.4s, vW22.4s\n"
+ "str qV13, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV23.4s, vU47.4s, vW23.4s\n"
+ "ldr qU77, [uptr6, u_col_stride6]\n"
+ "fmla vV22.4s, vU45.4s, vW23.4s\n"
+ "fmla vV23.4s, vU45.4s, vW21.4s\n"
+ "ldr qU75, [uptr6, u_col_stride4]\n"
+ "fmla vV23.4s, vU46.4s, vW22.4s\n"
+ "ldr qU76, [uptr6, u_col_stride5]\n"
+ "fmul vV33.4s, vU67.4s, vW23.4s\n"
+ "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+ "fmul vV32.4s, vU65.4s, vW23.4s\n"
+ "fmla vV33.4s, vU65.4s, vW21.4s\n"
+ "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV33.4s, vU66.4s, vW22.4s\n"
+ "ldr qU34, [uptr2, u_col_stride3]\n"
+ "fmla vV23.4s, vU57.4s, vW33.4s\n"
+ "fmla vV33.4s, vU57.4s, vW13.4s\n"
+ "ldr qU33, [uptr2, u_col_stride2]\n"
+ "fmla vV22.4s, vU55.4s, vW33.4s\n"
+ "fmla vV23.4s, vU55.4s, vW31.4s\n"
+ "fmla vV32.4s, vU55.4s, vW13.4s\n"
+ "fmla vV33.4s, vU55.4s, vW11.4s\n"
+ "ldr qU24, [uptr1, u_col_stride3]\n"
+ "fmla vV23.4s, vU56.4s, vW32.4s\n"
+ "str qV23, [vptr1, v_col_stride2]\n"
+ "fmla vV33.4s, vU56.4s, vW12.4s\n"
+ "ldr qU23, [uptr1, u_col_stride2]\n"
+ "fmla vV33.4s, vU77.4s, vW33.4s\n"
+ "ldr qU44, [uptr3, u_col_stride3]\n"
+ "fmla vV32.4s, vU75.4s, vW33.4s\n"
+ "fmla vV33.4s, vU75.4s, vW31.4s\n"
+ "ldr qU43, [uptr3, u_col_stride2]\n"
+ "fmla vV33.4s, vU76.4s, vW32.4s\n"
+ "str qV33, [vptr2, v_col_stride2]\n"
+ "ldr qU64, [uptr5, u_col_stride3]\n"
+ "fmla vV12.4s, vU14.4s, vW12.4s\n"
+ "ldr qU63, [uptr5, u_col_stride2]\n"
+ "fmul vV11.4s, vU13.4s, vW13.4s\n"
+ "fmla vV12.4s, vU13.4s, vW11.4s\n"
+ "ldr qU54, [uptr4, u_col_stride3]\n"
+ "fmla vV12.4s, vU34.4s, vW32.4s\n"
+ "fmla vV22.4s, vU34.4s, vW12.4s\n"
+ "ldr qU53, [uptr4, u_col_stride2]\n"
+ "fmla vV11.4s, vU33.4s, vW33.4s\n"
+ "ldr qU74, [uptr6, u_col_stride3]\n"
+ "fmla vV12.4s, vU33.4s, vW31.4s\n"
+ "ldr qU73, [uptr6, u_col_stride2]\n"
+ "fmul vV21.4s, vU33.4s, vW13.4s\n"
+ "ldr qU12, [%x[uptr0], u_col_stride1]\n"
+ "fmla vV22.4s, vU33.4s, vW11.4s\n"
+ "ldr qU11, [%x[uptr0]], #0x10\n"
+ "fmla vV12.4s, vU24.4s, vW22.4s\n"
+ "ldr qU32, [uptr2, u_col_stride1]\n"
+ "fmla vV11.4s, vU23.4s, vW23.4s\n"
+ "ldr qU31, [uptr2], #0x10\n"
+ "fmla vV12.4s, vU23.4s, vW21.4s\n"
+ "str qV12, [%x[vptr0], v_col_stride1]\n"
+ "fmla vV22.4s, vU44.4s, vW22.4s\n"
+ "ldr qU22, [uptr1, u_col_stride1]\n"
+ "fmla vV21.4s, vU43.4s, vW23.4s\n"
+ "ldr qU21, [uptr1], #0x10\n"
+ "fmla vV22.4s, vU43.4s, vW21.4s\n"
+ "ldr qU42, [uptr3, u_col_stride1]\n"
+ "fmla vV32.4s, vU64.4s, vW22.4s\n"
+ "ldr qU41, [uptr3], #0x10\n"
+ "fmul vV31.4s, vU63.4s, vW23.4s\n"
+ "fmla vV32.4s, vU63.4s, vW21.4s\n"
+ "ldr qU62, [uptr5, u_col_stride1]\n"
+ "fmla vV22.4s, vU54.4s, vW32.4s\n"
+ "ldr qU61, [uptr5], #0x10\n"
+ "fmla vV32.4s, vU54.4s, vW12.4s\n"
+ "ldr qU52, [uptr4, u_col_stride1]\n"
+ "fmla vV21.4s, vU53.4s, vW33.4s\n"
+ "ldr qU51, [uptr4], #0x10\n"
+ "fmla vV22.4s, vU53.4s, vW31.4s\n"
+ "str qV22, [vptr1, v_col_stride1]\n"
+ "fmla vV31.4s, vU53.4s, vW13.4s\n"
+ "fmla vV32.4s, vU53.4s, vW11.4s\n"
+ "ldr qU72, [uptr6, u_col_stride1]\n"
+ "fmla vV32.4s, vU74.4s, vW32.4s\n"
+ "ldr qU71, [uptr6], #0x10\n"
+ "fmla vV31.4s, vU73.4s, vW33.4s\n"
+ "fmla vV32.4s, vU73.4s, vW31.4s\n"
+ "str qV32, [vptr2, v_col_stride1]\n"
+ "fmla vV11.4s, vU12.4s, vW12.4s\n"
+ "fmla vV11.4s, vU11.4s, vW11.4s\n"
+ "fmla vV11.4s, vU32.4s, vW32.4s\n"
+ "fmla vV21.4s, vU32.4s, vW12.4s\n"
+ "fmla vV11.4s, vU31.4s, vW31.4s\n"
+ "fmla vV21.4s, vU31.4s, vW11.4s\n"
+ "fmla vV11.4s, vU22.4s, vW22.4s\n"
+ "fmla vV11.4s, vU21.4s, vW21.4s\n"
+ "str qV11, [%x[vptr0]], #0x10\n"
+ "fmla vV21.4s, vU42.4s, vW22.4s\n"
+ "fmla vV21.4s, vU41.4s, vW21.4s\n"
+ "fmla vV31.4s, vU62.4s, vW22.4s\n"
+ "fmla vV31.4s, vU61.4s, vW21.4s\n"
+ "fmla vV21.4s, vU52.4s, vW32.4s\n"
+ "fmla vV31.4s, vU52.4s, vW12.4s\n"
+ "fmla vV21.4s, vU51.4s, vW31.4s\n"
+ "str qV21, [vptr1], #0x10\n"
+ "fmla vV31.4s, vU51.4s, vW11.4s\n"
+ "fmla vV31.4s, vU72.4s, vW32.4s\n"
+ "fmla vV31.4s, vU71.4s, vW31.4s\n"
+ "str qV31, [vptr2], #0x10\n"
+
+ // Clear aliases
+ ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
+ ".unreq uptr5\n" ".unreq uptr6\n"
+ ".unreq u_col_stride1\n" ".unreq u_col_stride2\n" ".unreq u_col_stride3\n"
+ ".unreq u_col_stride4\n" ".unreq u_col_stride5\n" ".unreq u_col_stride6\n"
+ ".unreq wptr1\n" ".unreq wptr2\n"
+ ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
+ ".unreq vptr1\n" ".unreq vptr2\n"
+ ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
+ ".unreq qU15\n" ".unreq qU73\n" ".unreq qU45\n" ".unreq qU14\n"
+ ".unreq qW13\n" ".unreq qU62\n" ".unreq qV12\n"
+ ".unreq qU51\n" ".unreq qU43\n" ".unreq qU55\n"
+ ".unreq qU77\n" ".unreq qV13\n" ".unreq qV31\n" ".unreq qU44\n"
+ ".unreq qV33\n" ".unreq qU46\n" ".unreq qU11\n" ".unreq qU37\n"
+ ".unreq qU56\n" ".unreq qU25\n" ".unreq qU32\n"
+ ".unreq qU72\n" ".unreq qV22\n"
+ ".unreq qU67\n" ".unreq qU61\n" ".unreq qU13\n" ".unreq qW33\n"
+ ".unreq qU74\n" ".unreq qU34\n" ".unreq qU17\n" ".unreq qU66\n"
+ ".unreq qU33\n" ".unreq qU57\n" ".unreq qU21\n"
+ ".unreq qW23\n" ".unreq qU42\n" ".unreq qV23\n" ".unreq qU23\n"
+ ".unreq qU76\n" ".unreq qU47\n" ".unreq qU64\n" ".unreq qU41\n"
+ ".unreq qU52\n" ".unreq qU54\n" ".unreq qU75\n" ".unreq qU26\n"
+ ".unreq qU53\n" ".unreq qU27\n"
+ ".unreq qV21\n" ".unreq qU65\n"
+ ".unreq qU31\n" ".unreq qU24\n" ".unreq qU36\n" ".unreq qU22\n"
+ ".unreq qU35\n" ".unreq qU63\n" ".unreq qW12\n"
+ ".unreq qV32\n" ".unreq qU16\n" ".unreq qW11\n" ".unreq qU12\n"
+ ".unreq qW31\n" ".unreq qW22\n" ".unreq qU71\n" ".unreq qV11\n"
+ ".unreq qW21\n" ".unreq qW32\n" ".unreq vW13\n"
+ ".unreq vU15\n" ".unreq vU73\n" ".unreq vU45\n" ".unreq vU14\n"
+ ".unreq vU62\n" ".unreq vV12\n"
+ ".unreq vU51\n" ".unreq vU43\n" ".unreq vU55\n"
+ ".unreq vU77\n" ".unreq vV13\n" ".unreq vV31\n" ".unreq vU44\n"
+ ".unreq vV33\n" ".unreq vU46\n" ".unreq vU11\n" ".unreq vU37\n"
+ ".unreq vU56\n" ".unreq vU25\n" ".unreq vU32\n"
+ ".unreq vU72\n" ".unreq vV22\n" ".unreq vW21\n" ".unreq vW32\n"
+ ".unreq vU67\n" ".unreq vU61\n" ".unreq vU13\n"
+ ".unreq vU74\n" ".unreq vU34\n" ".unreq vU17\n" ".unreq vU66\n"
+ ".unreq vU33\n" ".unreq vU57\n" ".unreq vU21\n" ".unreq vW23\n"
+ ".unreq vU42\n" ".unreq vV23\n" ".unreq vU23\n" ".unreq vW33\n"
+ ".unreq vU76\n" ".unreq vU47\n" ".unreq vU64\n" ".unreq vU41\n"
+ ".unreq vU52\n" ".unreq vU54\n" ".unreq vU75\n" ".unreq vU26\n"
+ ".unreq vU53\n" ".unreq vU27\n" ".unreq vV21\n" ".unreq vU65\n"
+ ".unreq vU31\n" ".unreq vU24\n" ".unreq vU36\n" ".unreq vU22\n"
+ ".unreq vU35\n" ".unreq vU63\n" ".unreq vW12\n"
+ ".unreq vV32\n" ".unreq vU16\n" ".unreq vW11\n" ".unreq vU12\n"
+ ".unreq vW31\n" ".unreq vW22\n" ".unreq vU71\n" ".unreq vV11\n"
+ : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
+ [n_iters] "+r" (n_iters)
+ : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+ [u_col_stride] "r" (in_col_stride * sizeof(float)),
+ [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+ [w_col_stride] "r" (weight_col_stride * sizeof(float)),
+ [v_row_stride] "r" (out_row_stride * sizeof(float)),
+ [v_col_stride] "r" (out_col_stride * sizeof(float))
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
+ "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+ "x12", "x13", "x14", "x15", "x16", "cc", "memory"
+ );
+ }
+ if (channels_remaining)
+ {
+ // Fall back on the unoptimised version to clean up the tail
+ ConvImpl::process_tile<false>(
+ channels_remaining,
+ wptr0, weight_row_stride, weight_col_stride,
+ uptr0, in_row_stride, in_col_stride,
+ vptr0, out_row_stride, out_col_stride,
+ 0, 0, 0, 0, 0, 0
+ );
+ }
+}
+
+#endif // __aarch64__
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
};
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
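+// The bottom/right tables are indexed first by the input padding, then by the
+// output padding; entry [0][0] reuses the unpadded fast path.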
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
} // namespace depthwise
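The hunks above replace the exhaustive six-dimensional tile_fns lookup with an
unpadded fast path, four small per-edge tables, and a generic scalar fallback.
A minimal sketch of how such tables would presumably be consulted at dispatch
time follows; the selector name and the exact conditions are illustrative
assumptions, not the library's verbatim code:

    static Conv::TileFn select_tile_fn(
        const int in_pad_top, const int in_pad_left,
        const int in_pad_bottom, const int in_pad_right,
        const int out_pad_bottom, const int out_pad_right)
    {
      // Determine which edges of the tile need special handling.
      const bool top    = in_pad_top != 0;
      const bool left   = in_pad_left != 0;
      const bool bottom = (in_pad_bottom != 0) || (out_pad_bottom != 0);
      const bool right  = (in_pad_right != 0) || (out_pad_right != 0);

      if (top + left + bottom + right > 1)
        return Conv::tilefn_generic;  // mixed/corner padding: scalar fallback
      if (top)    return Conv::tilefn_top[in_pad_top];
      if (left)   return Conv::tilefn_left[in_pad_left];
      if (bottom) return Conv::tilefn_bottom[in_pad_bottom][out_pad_bottom];
      if (right)  return Conv::tilefn_right[in_pad_right][out_pad_right];
      return Conv::tilefn_unpadded;   // interior tile: the aarch64 fast path
    }

Storing one table per edge turns the old product-of-padding-ranges number of
function pointers into a sum, which is why the next hunk alone shrinks from
2668 to 1465 lines.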
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index a1aaaa0..44b93a1 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -28,2668 +28,1465 @@
using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float, float>;
+#ifdef __aarch64__
+
template <>
-const Conv::TileFn Conv::tile_fns
- [max_in_pad_top]
- [max_in_pad_left]
- [max_in_pad_bottom]
- [max_in_pad_right]
- [max_out_pad_bottom]
- [max_out_pad_right] = {
- { // Input pad top = 0
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<0, 0, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>,
- ConvImpl::template process_tile<0, 0, 0, 0, 1, 2>,
- ConvImpl::template process_tile<0, 0, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<0, 0, 0, 0, 2, 1>,
- ConvImpl::template process_tile<0, 0, 0, 0, 2, 2>,
- ConvImpl::template process_tile<0, 0, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 0, 0, 3, 0>,
- ConvImpl::template process_tile<0, 0, 0, 0, 3, 1>,
- ConvImpl::template process_tile<0, 0, 0, 0, 3, 2>,
- ConvImpl::template process_tile<0, 0, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<0, 0, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>,
- ConvImpl::template process_tile<0, 0, 0, 1, 1, 2>,
- ConvImpl::template process_tile<0, 0, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 0, 1, 2, 0>,
- ConvImpl::template process_tile<0, 0, 0, 1, 2, 1>,
- ConvImpl::template process_tile<0, 0, 0, 1, 2, 2>,
- ConvImpl::template process_tile<0, 0, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 0, 1, 3, 0>,
- ConvImpl::template process_tile<0, 0, 0, 1, 3, 1>,
- ConvImpl::template process_tile<0, 0, 0, 1, 3, 2>,
- ConvImpl::template process_tile<0, 0, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<0, 0, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>,
- ConvImpl::template process_tile<0, 0, 0, 2, 1, 2>,
- ConvImpl::template process_tile<0, 0, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 0, 2, 2, 0>,
- ConvImpl::template process_tile<0, 0, 0, 2, 2, 1>,
- ConvImpl::template process_tile<0, 0, 0, 2, 2, 2>,
- ConvImpl::template process_tile<0, 0, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 0, 2, 3, 0>,
- ConvImpl::template process_tile<0, 0, 0, 2, 3, 1>,
- ConvImpl::template process_tile<0, 0, 0, 2, 3, 2>,
- ConvImpl::template process_tile<0, 0, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<0, 0, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 3, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 3, 1, 1>,
- ConvImpl::template process_tile<0, 0, 0, 3, 1, 2>,
- ConvImpl::template process_tile<0, 0, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 0, 3, 2, 0>,
- ConvImpl::template process_tile<0, 0, 0, 3, 2, 1>,
- ConvImpl::template process_tile<0, 0, 0, 3, 2, 2>,
- ConvImpl::template process_tile<0, 0, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 0, 3, 3, 0>,
- ConvImpl::template process_tile<0, 0, 0, 3, 3, 1>,
- ConvImpl::template process_tile<0, 0, 0, 3, 3, 2>,
- ConvImpl::template process_tile<0, 0, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<0, 0, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 0, 4, 1, 0>,
- ConvImpl::template process_tile<0, 0, 0, 4, 1, 1>,
- ConvImpl::template process_tile<0, 0, 0, 4, 1, 2>,
- ConvImpl::template process_tile<0, 0, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 0, 4, 2, 0>,
- ConvImpl::template process_tile<0, 0, 0, 4, 2, 1>,
- ConvImpl::template process_tile<0, 0, 0, 4, 2, 2>,
- ConvImpl::template process_tile<0, 0, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 0, 4, 3, 0>,
- ConvImpl::template process_tile<0, 0, 0, 4, 3, 1>,
- ConvImpl::template process_tile<0, 0, 0, 4, 3, 2>,
- ConvImpl::template process_tile<0, 0, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>,
- ConvImpl::template process_tile<0, 0, 1, 0, 0, 2>,
- ConvImpl::template process_tile<0, 0, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>,
- ConvImpl::template process_tile<0, 0, 1, 0, 1, 2>,
- ConvImpl::template process_tile<0, 0, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<0, 0, 1, 0, 2, 1>,
- ConvImpl::template process_tile<0, 0, 1, 0, 2, 2>,
- ConvImpl::template process_tile<0, 0, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 1, 0, 3, 0>,
- ConvImpl::template process_tile<0, 0, 1, 0, 3, 1>,
- ConvImpl::template process_tile<0, 0, 1, 0, 3, 2>,
- ConvImpl::template process_tile<0, 0, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>,
- ConvImpl::template process_tile<0, 0, 1, 1, 0, 2>,
- ConvImpl::template process_tile<0, 0, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>,
- ConvImpl::template process_tile<0, 0, 1, 1, 1, 2>,
- ConvImpl::template process_tile<0, 0, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 1, 1, 2, 0>,
- ConvImpl::template process_tile<0, 0, 1, 1, 2, 1>,
- ConvImpl::template process_tile<0, 0, 1, 1, 2, 2>,
- ConvImpl::template process_tile<0, 0, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 1, 1, 3, 0>,
- ConvImpl::template process_tile<0, 0, 1, 1, 3, 1>,
- ConvImpl::template process_tile<0, 0, 1, 1, 3, 2>,
- ConvImpl::template process_tile<0, 0, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>,
- ConvImpl::template process_tile<0, 0, 1, 2, 0, 2>,
- ConvImpl::template process_tile<0, 0, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>,
- ConvImpl::template process_tile<0, 0, 1, 2, 1, 2>,
- ConvImpl::template process_tile<0, 0, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 1, 2, 2, 0>,
- ConvImpl::template process_tile<0, 0, 1, 2, 2, 1>,
- ConvImpl::template process_tile<0, 0, 1, 2, 2, 2>,
- ConvImpl::template process_tile<0, 0, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 1, 2, 3, 0>,
- ConvImpl::template process_tile<0, 0, 1, 2, 3, 1>,
- ConvImpl::template process_tile<0, 0, 1, 2, 3, 2>,
- ConvImpl::template process_tile<0, 0, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 3, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 3, 0, 1>,
- ConvImpl::template process_tile<0, 0, 1, 3, 0, 2>,
- ConvImpl::template process_tile<0, 0, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 3, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 3, 1, 1>,
- ConvImpl::template process_tile<0, 0, 1, 3, 1, 2>,
- ConvImpl::template process_tile<0, 0, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 1, 3, 2, 0>,
- ConvImpl::template process_tile<0, 0, 1, 3, 2, 1>,
- ConvImpl::template process_tile<0, 0, 1, 3, 2, 2>,
- ConvImpl::template process_tile<0, 0, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 1, 3, 3, 0>,
- ConvImpl::template process_tile<0, 0, 1, 3, 3, 1>,
- ConvImpl::template process_tile<0, 0, 1, 3, 3, 2>,
- ConvImpl::template process_tile<0, 0, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 1, 4, 0, 0>,
- ConvImpl::template process_tile<0, 0, 1, 4, 0, 1>,
- ConvImpl::template process_tile<0, 0, 1, 4, 0, 2>,
- ConvImpl::template process_tile<0, 0, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 1, 4, 1, 0>,
- ConvImpl::template process_tile<0, 0, 1, 4, 1, 1>,
- ConvImpl::template process_tile<0, 0, 1, 4, 1, 2>,
- ConvImpl::template process_tile<0, 0, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 1, 4, 2, 0>,
- ConvImpl::template process_tile<0, 0, 1, 4, 2, 1>,
- ConvImpl::template process_tile<0, 0, 1, 4, 2, 2>,
- ConvImpl::template process_tile<0, 0, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 1, 4, 3, 0>,
- ConvImpl::template process_tile<0, 0, 1, 4, 3, 1>,
- ConvImpl::template process_tile<0, 0, 1, 4, 3, 2>,
- ConvImpl::template process_tile<0, 0, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>,
- ConvImpl::template process_tile<0, 0, 2, 0, 0, 2>,
- ConvImpl::template process_tile<0, 0, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>,
- ConvImpl::template process_tile<0, 0, 2, 0, 1, 2>,
- ConvImpl::template process_tile<0, 0, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<0, 0, 2, 0, 2, 1>,
- ConvImpl::template process_tile<0, 0, 2, 0, 2, 2>,
- ConvImpl::template process_tile<0, 0, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 2, 0, 3, 0>,
- ConvImpl::template process_tile<0, 0, 2, 0, 3, 1>,
- ConvImpl::template process_tile<0, 0, 2, 0, 3, 2>,
- ConvImpl::template process_tile<0, 0, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>,
- ConvImpl::template process_tile<0, 0, 2, 1, 0, 2>,
- ConvImpl::template process_tile<0, 0, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>,
- ConvImpl::template process_tile<0, 0, 2, 1, 1, 2>,
- ConvImpl::template process_tile<0, 0, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 2, 1, 2, 0>,
- ConvImpl::template process_tile<0, 0, 2, 1, 2, 1>,
- ConvImpl::template process_tile<0, 0, 2, 1, 2, 2>,
- ConvImpl::template process_tile<0, 0, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 2, 1, 3, 0>,
- ConvImpl::template process_tile<0, 0, 2, 1, 3, 1>,
- ConvImpl::template process_tile<0, 0, 2, 1, 3, 2>,
- ConvImpl::template process_tile<0, 0, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>,
- ConvImpl::template process_tile<0, 0, 2, 2, 0, 2>,
- ConvImpl::template process_tile<0, 0, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>,
- ConvImpl::template process_tile<0, 0, 2, 2, 1, 2>,
- ConvImpl::template process_tile<0, 0, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 2, 2, 2, 0>,
- ConvImpl::template process_tile<0, 0, 2, 2, 2, 1>,
- ConvImpl::template process_tile<0, 0, 2, 2, 2, 2>,
- ConvImpl::template process_tile<0, 0, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 2, 2, 3, 0>,
- ConvImpl::template process_tile<0, 0, 2, 2, 3, 1>,
- ConvImpl::template process_tile<0, 0, 2, 2, 3, 2>,
- ConvImpl::template process_tile<0, 0, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 3, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 3, 0, 1>,
- ConvImpl::template process_tile<0, 0, 2, 3, 0, 2>,
- ConvImpl::template process_tile<0, 0, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 3, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 3, 1, 1>,
- ConvImpl::template process_tile<0, 0, 2, 3, 1, 2>,
- ConvImpl::template process_tile<0, 0, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 2, 3, 2, 0>,
- ConvImpl::template process_tile<0, 0, 2, 3, 2, 1>,
- ConvImpl::template process_tile<0, 0, 2, 3, 2, 2>,
- ConvImpl::template process_tile<0, 0, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 2, 3, 3, 0>,
- ConvImpl::template process_tile<0, 0, 2, 3, 3, 1>,
- ConvImpl::template process_tile<0, 0, 2, 3, 3, 2>,
- ConvImpl::template process_tile<0, 0, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 2, 4, 0, 0>,
- ConvImpl::template process_tile<0, 0, 2, 4, 0, 1>,
- ConvImpl::template process_tile<0, 0, 2, 4, 0, 2>,
- ConvImpl::template process_tile<0, 0, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 2, 4, 1, 0>,
- ConvImpl::template process_tile<0, 0, 2, 4, 1, 1>,
- ConvImpl::template process_tile<0, 0, 2, 4, 1, 2>,
- ConvImpl::template process_tile<0, 0, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 2, 4, 2, 0>,
- ConvImpl::template process_tile<0, 0, 2, 4, 2, 1>,
- ConvImpl::template process_tile<0, 0, 2, 4, 2, 2>,
- ConvImpl::template process_tile<0, 0, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 2, 4, 3, 0>,
- ConvImpl::template process_tile<0, 0, 2, 4, 3, 1>,
- ConvImpl::template process_tile<0, 0, 2, 4, 3, 2>,
- ConvImpl::template process_tile<0, 0, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 3, 0, 0, 1>,
- ConvImpl::template process_tile<0, 0, 3, 0, 0, 2>,
- ConvImpl::template process_tile<0, 0, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 3, 0, 1, 1>,
- ConvImpl::template process_tile<0, 0, 3, 0, 1, 2>,
- ConvImpl::template process_tile<0, 0, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<0, 0, 3, 0, 2, 1>,
- ConvImpl::template process_tile<0, 0, 3, 0, 2, 2>,
- ConvImpl::template process_tile<0, 0, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 3, 0, 3, 0>,
- ConvImpl::template process_tile<0, 0, 3, 0, 3, 1>,
- ConvImpl::template process_tile<0, 0, 3, 0, 3, 2>,
- ConvImpl::template process_tile<0, 0, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 3, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 3, 1, 0, 1>,
- ConvImpl::template process_tile<0, 0, 3, 1, 0, 2>,
- ConvImpl::template process_tile<0, 0, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 3, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 3, 1, 1, 1>,
- ConvImpl::template process_tile<0, 0, 3, 1, 1, 2>,
- ConvImpl::template process_tile<0, 0, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 3, 1, 2, 0>,
- ConvImpl::template process_tile<0, 0, 3, 1, 2, 1>,
- ConvImpl::template process_tile<0, 0, 3, 1, 2, 2>,
- ConvImpl::template process_tile<0, 0, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 3, 1, 3, 0>,
- ConvImpl::template process_tile<0, 0, 3, 1, 3, 1>,
- ConvImpl::template process_tile<0, 0, 3, 1, 3, 2>,
- ConvImpl::template process_tile<0, 0, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 3, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 3, 2, 0, 1>,
- ConvImpl::template process_tile<0, 0, 3, 2, 0, 2>,
- ConvImpl::template process_tile<0, 0, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 3, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 3, 2, 1, 1>,
- ConvImpl::template process_tile<0, 0, 3, 2, 1, 2>,
- ConvImpl::template process_tile<0, 0, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 3, 2, 2, 0>,
- ConvImpl::template process_tile<0, 0, 3, 2, 2, 1>,
- ConvImpl::template process_tile<0, 0, 3, 2, 2, 2>,
- ConvImpl::template process_tile<0, 0, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 3, 2, 3, 0>,
- ConvImpl::template process_tile<0, 0, 3, 2, 3, 1>,
- ConvImpl::template process_tile<0, 0, 3, 2, 3, 2>,
- ConvImpl::template process_tile<0, 0, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 3, 3, 0, 0>,
- ConvImpl::template process_tile<0, 0, 3, 3, 0, 1>,
- ConvImpl::template process_tile<0, 0, 3, 3, 0, 2>,
- ConvImpl::template process_tile<0, 0, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 3, 3, 1, 0>,
- ConvImpl::template process_tile<0, 0, 3, 3, 1, 1>,
- ConvImpl::template process_tile<0, 0, 3, 3, 1, 2>,
- ConvImpl::template process_tile<0, 0, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 3, 3, 2, 0>,
- ConvImpl::template process_tile<0, 0, 3, 3, 2, 1>,
- ConvImpl::template process_tile<0, 0, 3, 3, 2, 2>,
- ConvImpl::template process_tile<0, 0, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 3, 3, 3, 0>,
- ConvImpl::template process_tile<0, 0, 3, 3, 3, 1>,
- ConvImpl::template process_tile<0, 0, 3, 3, 3, 2>,
- ConvImpl::template process_tile<0, 0, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 3, 4, 0, 0>,
- ConvImpl::template process_tile<0, 0, 3, 4, 0, 1>,
- ConvImpl::template process_tile<0, 0, 3, 4, 0, 2>,
- ConvImpl::template process_tile<0, 0, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 3, 4, 1, 0>,
- ConvImpl::template process_tile<0, 0, 3, 4, 1, 1>,
- ConvImpl::template process_tile<0, 0, 3, 4, 1, 2>,
- ConvImpl::template process_tile<0, 0, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 3, 4, 2, 0>,
- ConvImpl::template process_tile<0, 0, 3, 4, 2, 1>,
- ConvImpl::template process_tile<0, 0, 3, 4, 2, 2>,
- ConvImpl::template process_tile<0, 0, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 3, 4, 3, 0>,
- ConvImpl::template process_tile<0, 0, 3, 4, 3, 1>,
- ConvImpl::template process_tile<0, 0, 3, 4, 3, 2>,
- ConvImpl::template process_tile<0, 0, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<0, 0, 4, 0, 0, 1>,
- ConvImpl::template process_tile<0, 0, 4, 0, 0, 2>,
- ConvImpl::template process_tile<0, 0, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<0, 0, 4, 0, 1, 1>,
- ConvImpl::template process_tile<0, 0, 4, 0, 1, 2>,
- ConvImpl::template process_tile<0, 0, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<0, 0, 4, 0, 2, 1>,
- ConvImpl::template process_tile<0, 0, 4, 0, 2, 2>,
- ConvImpl::template process_tile<0, 0, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 4, 0, 3, 0>,
- ConvImpl::template process_tile<0, 0, 4, 0, 3, 1>,
- ConvImpl::template process_tile<0, 0, 4, 0, 3, 2>,
- ConvImpl::template process_tile<0, 0, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 4, 1, 0, 0>,
- ConvImpl::template process_tile<0, 0, 4, 1, 0, 1>,
- ConvImpl::template process_tile<0, 0, 4, 1, 0, 2>,
- ConvImpl::template process_tile<0, 0, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 4, 1, 1, 0>,
- ConvImpl::template process_tile<0, 0, 4, 1, 1, 1>,
- ConvImpl::template process_tile<0, 0, 4, 1, 1, 2>,
- ConvImpl::template process_tile<0, 0, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 4, 1, 2, 0>,
- ConvImpl::template process_tile<0, 0, 4, 1, 2, 1>,
- ConvImpl::template process_tile<0, 0, 4, 1, 2, 2>,
- ConvImpl::template process_tile<0, 0, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 4, 1, 3, 0>,
- ConvImpl::template process_tile<0, 0, 4, 1, 3, 1>,
- ConvImpl::template process_tile<0, 0, 4, 1, 3, 2>,
- ConvImpl::template process_tile<0, 0, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 4, 2, 0, 0>,
- ConvImpl::template process_tile<0, 0, 4, 2, 0, 1>,
- ConvImpl::template process_tile<0, 0, 4, 2, 0, 2>,
- ConvImpl::template process_tile<0, 0, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 4, 2, 1, 0>,
- ConvImpl::template process_tile<0, 0, 4, 2, 1, 1>,
- ConvImpl::template process_tile<0, 0, 4, 2, 1, 2>,
- ConvImpl::template process_tile<0, 0, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 4, 2, 2, 0>,
- ConvImpl::template process_tile<0, 0, 4, 2, 2, 1>,
- ConvImpl::template process_tile<0, 0, 4, 2, 2, 2>,
- ConvImpl::template process_tile<0, 0, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 4, 2, 3, 0>,
- ConvImpl::template process_tile<0, 0, 4, 2, 3, 1>,
- ConvImpl::template process_tile<0, 0, 4, 2, 3, 2>,
- ConvImpl::template process_tile<0, 0, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 4, 3, 0, 0>,
- ConvImpl::template process_tile<0, 0, 4, 3, 0, 1>,
- ConvImpl::template process_tile<0, 0, 4, 3, 0, 2>,
- ConvImpl::template process_tile<0, 0, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 4, 3, 1, 0>,
- ConvImpl::template process_tile<0, 0, 4, 3, 1, 1>,
- ConvImpl::template process_tile<0, 0, 4, 3, 1, 2>,
- ConvImpl::template process_tile<0, 0, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 4, 3, 2, 0>,
- ConvImpl::template process_tile<0, 0, 4, 3, 2, 1>,
- ConvImpl::template process_tile<0, 0, 4, 3, 2, 2>,
- ConvImpl::template process_tile<0, 0, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 4, 3, 3, 0>,
- ConvImpl::template process_tile<0, 0, 4, 3, 3, 1>,
- ConvImpl::template process_tile<0, 0, 4, 3, 3, 2>,
- ConvImpl::template process_tile<0, 0, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 0, 4, 4, 0, 0>,
- ConvImpl::template process_tile<0, 0, 4, 4, 0, 1>,
- ConvImpl::template process_tile<0, 0, 4, 4, 0, 2>,
- ConvImpl::template process_tile<0, 0, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 0, 4, 4, 1, 0>,
- ConvImpl::template process_tile<0, 0, 4, 4, 1, 1>,
- ConvImpl::template process_tile<0, 0, 4, 4, 1, 2>,
- ConvImpl::template process_tile<0, 0, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 0, 4, 4, 2, 0>,
- ConvImpl::template process_tile<0, 0, 4, 4, 2, 1>,
- ConvImpl::template process_tile<0, 0, 4, 4, 2, 2>,
- ConvImpl::template process_tile<0, 0, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 0, 4, 4, 3, 0>,
- ConvImpl::template process_tile<0, 0, 4, 4, 3, 1>,
- ConvImpl::template process_tile<0, 0, 4, 4, 3, 2>,
- ConvImpl::template process_tile<0, 0, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>,
- ConvImpl::template process_tile<0, 1, 0, 0, 0, 2>,
- ConvImpl::template process_tile<0, 1, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>,
- ConvImpl::template process_tile<0, 1, 0, 0, 1, 2>,
- ConvImpl::template process_tile<0, 1, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 0, 0, 2, 0>,
- ConvImpl::template process_tile<0, 1, 0, 0, 2, 1>,
- ConvImpl::template process_tile<0, 1, 0, 0, 2, 2>,
- ConvImpl::template process_tile<0, 1, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 0, 0, 3, 0>,
- ConvImpl::template process_tile<0, 1, 0, 0, 3, 1>,
- ConvImpl::template process_tile<0, 1, 0, 0, 3, 2>,
- ConvImpl::template process_tile<0, 1, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>,
- ConvImpl::template process_tile<0, 1, 0, 1, 0, 2>,
- ConvImpl::template process_tile<0, 1, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>,
- ConvImpl::template process_tile<0, 1, 0, 1, 1, 2>,
- ConvImpl::template process_tile<0, 1, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 0, 1, 2, 0>,
- ConvImpl::template process_tile<0, 1, 0, 1, 2, 1>,
- ConvImpl::template process_tile<0, 1, 0, 1, 2, 2>,
- ConvImpl::template process_tile<0, 1, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 0, 1, 3, 0>,
- ConvImpl::template process_tile<0, 1, 0, 1, 3, 1>,
- ConvImpl::template process_tile<0, 1, 0, 1, 3, 2>,
- ConvImpl::template process_tile<0, 1, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>,
- ConvImpl::template process_tile<0, 1, 0, 2, 0, 2>,
- ConvImpl::template process_tile<0, 1, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>,
- ConvImpl::template process_tile<0, 1, 0, 2, 1, 2>,
- ConvImpl::template process_tile<0, 1, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 0, 2, 2, 0>,
- ConvImpl::template process_tile<0, 1, 0, 2, 2, 1>,
- ConvImpl::template process_tile<0, 1, 0, 2, 2, 2>,
- ConvImpl::template process_tile<0, 1, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 0, 2, 3, 0>,
- ConvImpl::template process_tile<0, 1, 0, 2, 3, 1>,
- ConvImpl::template process_tile<0, 1, 0, 2, 3, 2>,
- ConvImpl::template process_tile<0, 1, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 3, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 3, 0, 1>,
- ConvImpl::template process_tile<0, 1, 0, 3, 0, 2>,
- ConvImpl::template process_tile<0, 1, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 3, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 3, 1, 1>,
- ConvImpl::template process_tile<0, 1, 0, 3, 1, 2>,
- ConvImpl::template process_tile<0, 1, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 0, 3, 2, 0>,
- ConvImpl::template process_tile<0, 1, 0, 3, 2, 1>,
- ConvImpl::template process_tile<0, 1, 0, 3, 2, 2>,
- ConvImpl::template process_tile<0, 1, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 0, 3, 3, 0>,
- ConvImpl::template process_tile<0, 1, 0, 3, 3, 1>,
- ConvImpl::template process_tile<0, 1, 0, 3, 3, 2>,
- ConvImpl::template process_tile<0, 1, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 0, 4, 0, 0>,
- ConvImpl::template process_tile<0, 1, 0, 4, 0, 1>,
- ConvImpl::template process_tile<0, 1, 0, 4, 0, 2>,
- ConvImpl::template process_tile<0, 1, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 0, 4, 1, 0>,
- ConvImpl::template process_tile<0, 1, 0, 4, 1, 1>,
- ConvImpl::template process_tile<0, 1, 0, 4, 1, 2>,
- ConvImpl::template process_tile<0, 1, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 0, 4, 2, 0>,
- ConvImpl::template process_tile<0, 1, 0, 4, 2, 1>,
- ConvImpl::template process_tile<0, 1, 0, 4, 2, 2>,
- ConvImpl::template process_tile<0, 1, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 0, 4, 3, 0>,
- ConvImpl::template process_tile<0, 1, 0, 4, 3, 1>,
- ConvImpl::template process_tile<0, 1, 0, 4, 3, 2>,
- ConvImpl::template process_tile<0, 1, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>,
- ConvImpl::template process_tile<0, 1, 1, 0, 0, 2>,
- ConvImpl::template process_tile<0, 1, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>,
- ConvImpl::template process_tile<0, 1, 1, 0, 1, 2>,
- ConvImpl::template process_tile<0, 1, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 1, 0, 2, 0>,
- ConvImpl::template process_tile<0, 1, 1, 0, 2, 1>,
- ConvImpl::template process_tile<0, 1, 1, 0, 2, 2>,
- ConvImpl::template process_tile<0, 1, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 1, 0, 3, 0>,
- ConvImpl::template process_tile<0, 1, 1, 0, 3, 1>,
- ConvImpl::template process_tile<0, 1, 1, 0, 3, 2>,
- ConvImpl::template process_tile<0, 1, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>,
- ConvImpl::template process_tile<0, 1, 1, 1, 0, 2>,
- ConvImpl::template process_tile<0, 1, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>,
- ConvImpl::template process_tile<0, 1, 1, 1, 1, 2>,
- ConvImpl::template process_tile<0, 1, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 1, 1, 2, 0>,
- ConvImpl::template process_tile<0, 1, 1, 1, 2, 1>,
- ConvImpl::template process_tile<0, 1, 1, 1, 2, 2>,
- ConvImpl::template process_tile<0, 1, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 1, 1, 3, 0>,
- ConvImpl::template process_tile<0, 1, 1, 1, 3, 1>,
- ConvImpl::template process_tile<0, 1, 1, 1, 3, 2>,
- ConvImpl::template process_tile<0, 1, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>,
- ConvImpl::template process_tile<0, 1, 1, 2, 0, 2>,
- ConvImpl::template process_tile<0, 1, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>,
- ConvImpl::template process_tile<0, 1, 1, 2, 1, 2>,
- ConvImpl::template process_tile<0, 1, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 1, 2, 2, 0>,
- ConvImpl::template process_tile<0, 1, 1, 2, 2, 1>,
- ConvImpl::template process_tile<0, 1, 1, 2, 2, 2>,
- ConvImpl::template process_tile<0, 1, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 1, 2, 3, 0>,
- ConvImpl::template process_tile<0, 1, 1, 2, 3, 1>,
- ConvImpl::template process_tile<0, 1, 1, 2, 3, 2>,
- ConvImpl::template process_tile<0, 1, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 3, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 3, 0, 1>,
- ConvImpl::template process_tile<0, 1, 1, 3, 0, 2>,
- ConvImpl::template process_tile<0, 1, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 3, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 3, 1, 1>,
- ConvImpl::template process_tile<0, 1, 1, 3, 1, 2>,
- ConvImpl::template process_tile<0, 1, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 1, 3, 2, 0>,
- ConvImpl::template process_tile<0, 1, 1, 3, 2, 1>,
- ConvImpl::template process_tile<0, 1, 1, 3, 2, 2>,
- ConvImpl::template process_tile<0, 1, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 1, 3, 3, 0>,
- ConvImpl::template process_tile<0, 1, 1, 3, 3, 1>,
- ConvImpl::template process_tile<0, 1, 1, 3, 3, 2>,
- ConvImpl::template process_tile<0, 1, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 1, 4, 0, 0>,
- ConvImpl::template process_tile<0, 1, 1, 4, 0, 1>,
- ConvImpl::template process_tile<0, 1, 1, 4, 0, 2>,
- ConvImpl::template process_tile<0, 1, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 1, 4, 1, 0>,
- ConvImpl::template process_tile<0, 1, 1, 4, 1, 1>,
- ConvImpl::template process_tile<0, 1, 1, 4, 1, 2>,
- ConvImpl::template process_tile<0, 1, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 1, 4, 2, 0>,
- ConvImpl::template process_tile<0, 1, 1, 4, 2, 1>,
- ConvImpl::template process_tile<0, 1, 1, 4, 2, 2>,
- ConvImpl::template process_tile<0, 1, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 1, 4, 3, 0>,
- ConvImpl::template process_tile<0, 1, 1, 4, 3, 1>,
- ConvImpl::template process_tile<0, 1, 1, 4, 3, 2>,
- ConvImpl::template process_tile<0, 1, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>,
- ConvImpl::template process_tile<0, 1, 2, 0, 0, 2>,
- ConvImpl::template process_tile<0, 1, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>,
- ConvImpl::template process_tile<0, 1, 2, 0, 1, 2>,
- ConvImpl::template process_tile<0, 1, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 2, 0, 2, 0>,
- ConvImpl::template process_tile<0, 1, 2, 0, 2, 1>,
- ConvImpl::template process_tile<0, 1, 2, 0, 2, 2>,
- ConvImpl::template process_tile<0, 1, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 2, 0, 3, 0>,
- ConvImpl::template process_tile<0, 1, 2, 0, 3, 1>,
- ConvImpl::template process_tile<0, 1, 2, 0, 3, 2>,
- ConvImpl::template process_tile<0, 1, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>,
- ConvImpl::template process_tile<0, 1, 2, 1, 0, 2>,
- ConvImpl::template process_tile<0, 1, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>,
- ConvImpl::template process_tile<0, 1, 2, 1, 1, 2>,
- ConvImpl::template process_tile<0, 1, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 2, 1, 2, 0>,
- ConvImpl::template process_tile<0, 1, 2, 1, 2, 1>,
- ConvImpl::template process_tile<0, 1, 2, 1, 2, 2>,
- ConvImpl::template process_tile<0, 1, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 2, 1, 3, 0>,
- ConvImpl::template process_tile<0, 1, 2, 1, 3, 1>,
- ConvImpl::template process_tile<0, 1, 2, 1, 3, 2>,
- ConvImpl::template process_tile<0, 1, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>,
- ConvImpl::template process_tile<0, 1, 2, 2, 0, 2>,
- ConvImpl::template process_tile<0, 1, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>,
- ConvImpl::template process_tile<0, 1, 2, 2, 1, 2>,
- ConvImpl::template process_tile<0, 1, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 2, 2, 2, 0>,
- ConvImpl::template process_tile<0, 1, 2, 2, 2, 1>,
- ConvImpl::template process_tile<0, 1, 2, 2, 2, 2>,
- ConvImpl::template process_tile<0, 1, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 2, 2, 3, 0>,
- ConvImpl::template process_tile<0, 1, 2, 2, 3, 1>,
- ConvImpl::template process_tile<0, 1, 2, 2, 3, 2>,
- ConvImpl::template process_tile<0, 1, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 3, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 3, 0, 1>,
- ConvImpl::template process_tile<0, 1, 2, 3, 0, 2>,
- ConvImpl::template process_tile<0, 1, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 3, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 3, 1, 1>,
- ConvImpl::template process_tile<0, 1, 2, 3, 1, 2>,
- ConvImpl::template process_tile<0, 1, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 2, 3, 2, 0>,
- ConvImpl::template process_tile<0, 1, 2, 3, 2, 1>,
- ConvImpl::template process_tile<0, 1, 2, 3, 2, 2>,
- ConvImpl::template process_tile<0, 1, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 2, 3, 3, 0>,
- ConvImpl::template process_tile<0, 1, 2, 3, 3, 1>,
- ConvImpl::template process_tile<0, 1, 2, 3, 3, 2>,
- ConvImpl::template process_tile<0, 1, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 2, 4, 0, 0>,
- ConvImpl::template process_tile<0, 1, 2, 4, 0, 1>,
- ConvImpl::template process_tile<0, 1, 2, 4, 0, 2>,
- ConvImpl::template process_tile<0, 1, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 2, 4, 1, 0>,
- ConvImpl::template process_tile<0, 1, 2, 4, 1, 1>,
- ConvImpl::template process_tile<0, 1, 2, 4, 1, 2>,
- ConvImpl::template process_tile<0, 1, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 2, 4, 2, 0>,
- ConvImpl::template process_tile<0, 1, 2, 4, 2, 1>,
- ConvImpl::template process_tile<0, 1, 2, 4, 2, 2>,
- ConvImpl::template process_tile<0, 1, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 2, 4, 3, 0>,
- ConvImpl::template process_tile<0, 1, 2, 4, 3, 1>,
- ConvImpl::template process_tile<0, 1, 2, 4, 3, 2>,
- ConvImpl::template process_tile<0, 1, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 3, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 3, 0, 0, 1>,
- ConvImpl::template process_tile<0, 1, 3, 0, 0, 2>,
- ConvImpl::template process_tile<0, 1, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 3, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 3, 0, 1, 1>,
- ConvImpl::template process_tile<0, 1, 3, 0, 1, 2>,
- ConvImpl::template process_tile<0, 1, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 3, 0, 2, 0>,
- ConvImpl::template process_tile<0, 1, 3, 0, 2, 1>,
- ConvImpl::template process_tile<0, 1, 3, 0, 2, 2>,
- ConvImpl::template process_tile<0, 1, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 3, 0, 3, 0>,
- ConvImpl::template process_tile<0, 1, 3, 0, 3, 1>,
- ConvImpl::template process_tile<0, 1, 3, 0, 3, 2>,
- ConvImpl::template process_tile<0, 1, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 3, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 3, 1, 0, 1>,
- ConvImpl::template process_tile<0, 1, 3, 1, 0, 2>,
- ConvImpl::template process_tile<0, 1, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 3, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 3, 1, 1, 1>,
- ConvImpl::template process_tile<0, 1, 3, 1, 1, 2>,
- ConvImpl::template process_tile<0, 1, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 3, 1, 2, 0>,
- ConvImpl::template process_tile<0, 1, 3, 1, 2, 1>,
- ConvImpl::template process_tile<0, 1, 3, 1, 2, 2>,
- ConvImpl::template process_tile<0, 1, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 3, 1, 3, 0>,
- ConvImpl::template process_tile<0, 1, 3, 1, 3, 1>,
- ConvImpl::template process_tile<0, 1, 3, 1, 3, 2>,
- ConvImpl::template process_tile<0, 1, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 3, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 3, 2, 0, 1>,
- ConvImpl::template process_tile<0, 1, 3, 2, 0, 2>,
- ConvImpl::template process_tile<0, 1, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 3, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 3, 2, 1, 1>,
- ConvImpl::template process_tile<0, 1, 3, 2, 1, 2>,
- ConvImpl::template process_tile<0, 1, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 3, 2, 2, 0>,
- ConvImpl::template process_tile<0, 1, 3, 2, 2, 1>,
- ConvImpl::template process_tile<0, 1, 3, 2, 2, 2>,
- ConvImpl::template process_tile<0, 1, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 3, 2, 3, 0>,
- ConvImpl::template process_tile<0, 1, 3, 2, 3, 1>,
- ConvImpl::template process_tile<0, 1, 3, 2, 3, 2>,
- ConvImpl::template process_tile<0, 1, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 3, 3, 0, 0>,
- ConvImpl::template process_tile<0, 1, 3, 3, 0, 1>,
- ConvImpl::template process_tile<0, 1, 3, 3, 0, 2>,
- ConvImpl::template process_tile<0, 1, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 3, 3, 1, 0>,
- ConvImpl::template process_tile<0, 1, 3, 3, 1, 1>,
- ConvImpl::template process_tile<0, 1, 3, 3, 1, 2>,
- ConvImpl::template process_tile<0, 1, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 3, 3, 2, 0>,
- ConvImpl::template process_tile<0, 1, 3, 3, 2, 1>,
- ConvImpl::template process_tile<0, 1, 3, 3, 2, 2>,
- ConvImpl::template process_tile<0, 1, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 3, 3, 3, 0>,
- ConvImpl::template process_tile<0, 1, 3, 3, 3, 1>,
- ConvImpl::template process_tile<0, 1, 3, 3, 3, 2>,
- ConvImpl::template process_tile<0, 1, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 3, 4, 0, 0>,
- ConvImpl::template process_tile<0, 1, 3, 4, 0, 1>,
- ConvImpl::template process_tile<0, 1, 3, 4, 0, 2>,
- ConvImpl::template process_tile<0, 1, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 3, 4, 1, 0>,
- ConvImpl::template process_tile<0, 1, 3, 4, 1, 1>,
- ConvImpl::template process_tile<0, 1, 3, 4, 1, 2>,
- ConvImpl::template process_tile<0, 1, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 3, 4, 2, 0>,
- ConvImpl::template process_tile<0, 1, 3, 4, 2, 1>,
- ConvImpl::template process_tile<0, 1, 3, 4, 2, 2>,
- ConvImpl::template process_tile<0, 1, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 3, 4, 3, 0>,
- ConvImpl::template process_tile<0, 1, 3, 4, 3, 1>,
- ConvImpl::template process_tile<0, 1, 3, 4, 3, 2>,
- ConvImpl::template process_tile<0, 1, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 4, 0, 0, 0>,
- ConvImpl::template process_tile<0, 1, 4, 0, 0, 1>,
- ConvImpl::template process_tile<0, 1, 4, 0, 0, 2>,
- ConvImpl::template process_tile<0, 1, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 4, 0, 1, 0>,
- ConvImpl::template process_tile<0, 1, 4, 0, 1, 1>,
- ConvImpl::template process_tile<0, 1, 4, 0, 1, 2>,
- ConvImpl::template process_tile<0, 1, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 4, 0, 2, 0>,
- ConvImpl::template process_tile<0, 1, 4, 0, 2, 1>,
- ConvImpl::template process_tile<0, 1, 4, 0, 2, 2>,
- ConvImpl::template process_tile<0, 1, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 4, 0, 3, 0>,
- ConvImpl::template process_tile<0, 1, 4, 0, 3, 1>,
- ConvImpl::template process_tile<0, 1, 4, 0, 3, 2>,
- ConvImpl::template process_tile<0, 1, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 4, 1, 0, 0>,
- ConvImpl::template process_tile<0, 1, 4, 1, 0, 1>,
- ConvImpl::template process_tile<0, 1, 4, 1, 0, 2>,
- ConvImpl::template process_tile<0, 1, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 4, 1, 1, 0>,
- ConvImpl::template process_tile<0, 1, 4, 1, 1, 1>,
- ConvImpl::template process_tile<0, 1, 4, 1, 1, 2>,
- ConvImpl::template process_tile<0, 1, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 4, 1, 2, 0>,
- ConvImpl::template process_tile<0, 1, 4, 1, 2, 1>,
- ConvImpl::template process_tile<0, 1, 4, 1, 2, 2>,
- ConvImpl::template process_tile<0, 1, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 4, 1, 3, 0>,
- ConvImpl::template process_tile<0, 1, 4, 1, 3, 1>,
- ConvImpl::template process_tile<0, 1, 4, 1, 3, 2>,
- ConvImpl::template process_tile<0, 1, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 4, 2, 0, 0>,
- ConvImpl::template process_tile<0, 1, 4, 2, 0, 1>,
- ConvImpl::template process_tile<0, 1, 4, 2, 0, 2>,
- ConvImpl::template process_tile<0, 1, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 4, 2, 1, 0>,
- ConvImpl::template process_tile<0, 1, 4, 2, 1, 1>,
- ConvImpl::template process_tile<0, 1, 4, 2, 1, 2>,
- ConvImpl::template process_tile<0, 1, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 4, 2, 2, 0>,
- ConvImpl::template process_tile<0, 1, 4, 2, 2, 1>,
- ConvImpl::template process_tile<0, 1, 4, 2, 2, 2>,
- ConvImpl::template process_tile<0, 1, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 4, 2, 3, 0>,
- ConvImpl::template process_tile<0, 1, 4, 2, 3, 1>,
- ConvImpl::template process_tile<0, 1, 4, 2, 3, 2>,
- ConvImpl::template process_tile<0, 1, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 4, 3, 0, 0>,
- ConvImpl::template process_tile<0, 1, 4, 3, 0, 1>,
- ConvImpl::template process_tile<0, 1, 4, 3, 0, 2>,
- ConvImpl::template process_tile<0, 1, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 4, 3, 1, 0>,
- ConvImpl::template process_tile<0, 1, 4, 3, 1, 1>,
- ConvImpl::template process_tile<0, 1, 4, 3, 1, 2>,
- ConvImpl::template process_tile<0, 1, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 4, 3, 2, 0>,
- ConvImpl::template process_tile<0, 1, 4, 3, 2, 1>,
- ConvImpl::template process_tile<0, 1, 4, 3, 2, 2>,
- ConvImpl::template process_tile<0, 1, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 4, 3, 3, 0>,
- ConvImpl::template process_tile<0, 1, 4, 3, 3, 1>,
- ConvImpl::template process_tile<0, 1, 4, 3, 3, 2>,
- ConvImpl::template process_tile<0, 1, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<0, 1, 4, 4, 0, 0>,
- ConvImpl::template process_tile<0, 1, 4, 4, 0, 1>,
- ConvImpl::template process_tile<0, 1, 4, 4, 0, 2>,
- ConvImpl::template process_tile<0, 1, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<0, 1, 4, 4, 1, 0>,
- ConvImpl::template process_tile<0, 1, 4, 4, 1, 1>,
- ConvImpl::template process_tile<0, 1, 4, 4, 1, 2>,
- ConvImpl::template process_tile<0, 1, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<0, 1, 4, 4, 2, 0>,
- ConvImpl::template process_tile<0, 1, 4, 4, 2, 1>,
- ConvImpl::template process_tile<0, 1, 4, 4, 2, 2>,
- ConvImpl::template process_tile<0, 1, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<0, 1, 4, 4, 3, 0>,
- ConvImpl::template process_tile<0, 1, 4, 4, 3, 1>,
- ConvImpl::template process_tile<0, 1, 4, 4, 3, 2>,
- ConvImpl::template process_tile<0, 1, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 1
- }, // Input pad top = 0
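-   // Dispatch table of statically instantiated process_tile variants.
-   // The six template parameters specialise each entry for
-   // <input pad top, input pad left, input pad bottom, input pad right,
-   //  output pad bottom, output pad right>, so padding handling is resolved
-   // at compile time and run-time selection reduces to array indexing,
-   // e.g. (sketch only; index names assumed, not taken from this file):
-   //   tile_fns[pad_top][pad_left][pad_bottom][pad_right][out_pad_bottom][out_pad_right]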
- { // Input pad top = 1
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<1, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<1, 0, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>,
- ConvImpl::template process_tile<1, 0, 0, 0, 1, 2>,
- ConvImpl::template process_tile<1, 0, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<1, 0, 0, 0, 2, 1>,
- ConvImpl::template process_tile<1, 0, 0, 0, 2, 2>,
- ConvImpl::template process_tile<1, 0, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 0, 0, 3, 0>,
- ConvImpl::template process_tile<1, 0, 0, 0, 3, 1>,
- ConvImpl::template process_tile<1, 0, 0, 0, 3, 2>,
- ConvImpl::template process_tile<1, 0, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<1, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<1, 0, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>,
- ConvImpl::template process_tile<1, 0, 0, 1, 1, 2>,
- ConvImpl::template process_tile<1, 0, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 0, 1, 2, 0>,
- ConvImpl::template process_tile<1, 0, 0, 1, 2, 1>,
- ConvImpl::template process_tile<1, 0, 0, 1, 2, 2>,
- ConvImpl::template process_tile<1, 0, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 0, 1, 3, 0>,
- ConvImpl::template process_tile<1, 0, 0, 1, 3, 1>,
- ConvImpl::template process_tile<1, 0, 0, 1, 3, 2>,
- ConvImpl::template process_tile<1, 0, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<1, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<1, 0, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>,
- ConvImpl::template process_tile<1, 0, 0, 2, 1, 2>,
- ConvImpl::template process_tile<1, 0, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 0, 2, 2, 0>,
- ConvImpl::template process_tile<1, 0, 0, 2, 2, 1>,
- ConvImpl::template process_tile<1, 0, 0, 2, 2, 2>,
- ConvImpl::template process_tile<1, 0, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 0, 2, 3, 0>,
- ConvImpl::template process_tile<1, 0, 0, 2, 3, 1>,
- ConvImpl::template process_tile<1, 0, 0, 2, 3, 2>,
- ConvImpl::template process_tile<1, 0, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<1, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<1, 0, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 3, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 3, 1, 1>,
- ConvImpl::template process_tile<1, 0, 0, 3, 1, 2>,
- ConvImpl::template process_tile<1, 0, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 0, 3, 2, 0>,
- ConvImpl::template process_tile<1, 0, 0, 3, 2, 1>,
- ConvImpl::template process_tile<1, 0, 0, 3, 2, 2>,
- ConvImpl::template process_tile<1, 0, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 0, 3, 3, 0>,
- ConvImpl::template process_tile<1, 0, 0, 3, 3, 1>,
- ConvImpl::template process_tile<1, 0, 0, 3, 3, 2>,
- ConvImpl::template process_tile<1, 0, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<1, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<1, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<1, 0, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 0, 4, 1, 0>,
- ConvImpl::template process_tile<1, 0, 0, 4, 1, 1>,
- ConvImpl::template process_tile<1, 0, 0, 4, 1, 2>,
- ConvImpl::template process_tile<1, 0, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 0, 4, 2, 0>,
- ConvImpl::template process_tile<1, 0, 0, 4, 2, 1>,
- ConvImpl::template process_tile<1, 0, 0, 4, 2, 2>,
- ConvImpl::template process_tile<1, 0, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 0, 4, 3, 0>,
- ConvImpl::template process_tile<1, 0, 0, 4, 3, 1>,
- ConvImpl::template process_tile<1, 0, 0, 4, 3, 2>,
- ConvImpl::template process_tile<1, 0, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>,
- ConvImpl::template process_tile<1, 0, 1, 0, 0, 2>,
- ConvImpl::template process_tile<1, 0, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>,
- ConvImpl::template process_tile<1, 0, 1, 0, 1, 2>,
- ConvImpl::template process_tile<1, 0, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<1, 0, 1, 0, 2, 1>,
- ConvImpl::template process_tile<1, 0, 1, 0, 2, 2>,
- ConvImpl::template process_tile<1, 0, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 1, 0, 3, 0>,
- ConvImpl::template process_tile<1, 0, 1, 0, 3, 1>,
- ConvImpl::template process_tile<1, 0, 1, 0, 3, 2>,
- ConvImpl::template process_tile<1, 0, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>,
- ConvImpl::template process_tile<1, 0, 1, 1, 0, 2>,
- ConvImpl::template process_tile<1, 0, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>,
- ConvImpl::template process_tile<1, 0, 1, 1, 1, 2>,
- ConvImpl::template process_tile<1, 0, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 1, 1, 2, 0>,
- ConvImpl::template process_tile<1, 0, 1, 1, 2, 1>,
- ConvImpl::template process_tile<1, 0, 1, 1, 2, 2>,
- ConvImpl::template process_tile<1, 0, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 1, 1, 3, 0>,
- ConvImpl::template process_tile<1, 0, 1, 1, 3, 1>,
- ConvImpl::template process_tile<1, 0, 1, 1, 3, 2>,
- ConvImpl::template process_tile<1, 0, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>,
- ConvImpl::template process_tile<1, 0, 1, 2, 0, 2>,
- ConvImpl::template process_tile<1, 0, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>,
- ConvImpl::template process_tile<1, 0, 1, 2, 1, 2>,
- ConvImpl::template process_tile<1, 0, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 1, 2, 2, 0>,
- ConvImpl::template process_tile<1, 0, 1, 2, 2, 1>,
- ConvImpl::template process_tile<1, 0, 1, 2, 2, 2>,
- ConvImpl::template process_tile<1, 0, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 1, 2, 3, 0>,
- ConvImpl::template process_tile<1, 0, 1, 2, 3, 1>,
- ConvImpl::template process_tile<1, 0, 1, 2, 3, 2>,
- ConvImpl::template process_tile<1, 0, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 3, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 3, 0, 1>,
- ConvImpl::template process_tile<1, 0, 1, 3, 0, 2>,
- ConvImpl::template process_tile<1, 0, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 3, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 3, 1, 1>,
- ConvImpl::template process_tile<1, 0, 1, 3, 1, 2>,
- ConvImpl::template process_tile<1, 0, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 1, 3, 2, 0>,
- ConvImpl::template process_tile<1, 0, 1, 3, 2, 1>,
- ConvImpl::template process_tile<1, 0, 1, 3, 2, 2>,
- ConvImpl::template process_tile<1, 0, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 1, 3, 3, 0>,
- ConvImpl::template process_tile<1, 0, 1, 3, 3, 1>,
- ConvImpl::template process_tile<1, 0, 1, 3, 3, 2>,
- ConvImpl::template process_tile<1, 0, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 1, 4, 0, 0>,
- ConvImpl::template process_tile<1, 0, 1, 4, 0, 1>,
- ConvImpl::template process_tile<1, 0, 1, 4, 0, 2>,
- ConvImpl::template process_tile<1, 0, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 1, 4, 1, 0>,
- ConvImpl::template process_tile<1, 0, 1, 4, 1, 1>,
- ConvImpl::template process_tile<1, 0, 1, 4, 1, 2>,
- ConvImpl::template process_tile<1, 0, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 1, 4, 2, 0>,
- ConvImpl::template process_tile<1, 0, 1, 4, 2, 1>,
- ConvImpl::template process_tile<1, 0, 1, 4, 2, 2>,
- ConvImpl::template process_tile<1, 0, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 1, 4, 3, 0>,
- ConvImpl::template process_tile<1, 0, 1, 4, 3, 1>,
- ConvImpl::template process_tile<1, 0, 1, 4, 3, 2>,
- ConvImpl::template process_tile<1, 0, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>,
- ConvImpl::template process_tile<1, 0, 2, 0, 0, 2>,
- ConvImpl::template process_tile<1, 0, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>,
- ConvImpl::template process_tile<1, 0, 2, 0, 1, 2>,
- ConvImpl::template process_tile<1, 0, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<1, 0, 2, 0, 2, 1>,
- ConvImpl::template process_tile<1, 0, 2, 0, 2, 2>,
- ConvImpl::template process_tile<1, 0, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 2, 0, 3, 0>,
- ConvImpl::template process_tile<1, 0, 2, 0, 3, 1>,
- ConvImpl::template process_tile<1, 0, 2, 0, 3, 2>,
- ConvImpl::template process_tile<1, 0, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>,
- ConvImpl::template process_tile<1, 0, 2, 1, 0, 2>,
- ConvImpl::template process_tile<1, 0, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>,
- ConvImpl::template process_tile<1, 0, 2, 1, 1, 2>,
- ConvImpl::template process_tile<1, 0, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 2, 1, 2, 0>,
- ConvImpl::template process_tile<1, 0, 2, 1, 2, 1>,
- ConvImpl::template process_tile<1, 0, 2, 1, 2, 2>,
- ConvImpl::template process_tile<1, 0, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 2, 1, 3, 0>,
- ConvImpl::template process_tile<1, 0, 2, 1, 3, 1>,
- ConvImpl::template process_tile<1, 0, 2, 1, 3, 2>,
- ConvImpl::template process_tile<1, 0, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>,
- ConvImpl::template process_tile<1, 0, 2, 2, 0, 2>,
- ConvImpl::template process_tile<1, 0, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>,
- ConvImpl::template process_tile<1, 0, 2, 2, 1, 2>,
- ConvImpl::template process_tile<1, 0, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 2, 2, 2, 0>,
- ConvImpl::template process_tile<1, 0, 2, 2, 2, 1>,
- ConvImpl::template process_tile<1, 0, 2, 2, 2, 2>,
- ConvImpl::template process_tile<1, 0, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 2, 2, 3, 0>,
- ConvImpl::template process_tile<1, 0, 2, 2, 3, 1>,
- ConvImpl::template process_tile<1, 0, 2, 2, 3, 2>,
- ConvImpl::template process_tile<1, 0, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 3, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 3, 0, 1>,
- ConvImpl::template process_tile<1, 0, 2, 3, 0, 2>,
- ConvImpl::template process_tile<1, 0, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 3, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 3, 1, 1>,
- ConvImpl::template process_tile<1, 0, 2, 3, 1, 2>,
- ConvImpl::template process_tile<1, 0, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 2, 3, 2, 0>,
- ConvImpl::template process_tile<1, 0, 2, 3, 2, 1>,
- ConvImpl::template process_tile<1, 0, 2, 3, 2, 2>,
- ConvImpl::template process_tile<1, 0, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 2, 3, 3, 0>,
- ConvImpl::template process_tile<1, 0, 2, 3, 3, 1>,
- ConvImpl::template process_tile<1, 0, 2, 3, 3, 2>,
- ConvImpl::template process_tile<1, 0, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 2, 4, 0, 0>,
- ConvImpl::template process_tile<1, 0, 2, 4, 0, 1>,
- ConvImpl::template process_tile<1, 0, 2, 4, 0, 2>,
- ConvImpl::template process_tile<1, 0, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 2, 4, 1, 0>,
- ConvImpl::template process_tile<1, 0, 2, 4, 1, 1>,
- ConvImpl::template process_tile<1, 0, 2, 4, 1, 2>,
- ConvImpl::template process_tile<1, 0, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 2, 4, 2, 0>,
- ConvImpl::template process_tile<1, 0, 2, 4, 2, 1>,
- ConvImpl::template process_tile<1, 0, 2, 4, 2, 2>,
- ConvImpl::template process_tile<1, 0, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 2, 4, 3, 0>,
- ConvImpl::template process_tile<1, 0, 2, 4, 3, 1>,
- ConvImpl::template process_tile<1, 0, 2, 4, 3, 2>,
- ConvImpl::template process_tile<1, 0, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 3, 0, 0, 1>,
- ConvImpl::template process_tile<1, 0, 3, 0, 0, 2>,
- ConvImpl::template process_tile<1, 0, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 3, 0, 1, 1>,
- ConvImpl::template process_tile<1, 0, 3, 0, 1, 2>,
- ConvImpl::template process_tile<1, 0, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<1, 0, 3, 0, 2, 1>,
- ConvImpl::template process_tile<1, 0, 3, 0, 2, 2>,
- ConvImpl::template process_tile<1, 0, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 3, 0, 3, 0>,
- ConvImpl::template process_tile<1, 0, 3, 0, 3, 1>,
- ConvImpl::template process_tile<1, 0, 3, 0, 3, 2>,
- ConvImpl::template process_tile<1, 0, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 3, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 3, 1, 0, 1>,
- ConvImpl::template process_tile<1, 0, 3, 1, 0, 2>,
- ConvImpl::template process_tile<1, 0, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 3, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 3, 1, 1, 1>,
- ConvImpl::template process_tile<1, 0, 3, 1, 1, 2>,
- ConvImpl::template process_tile<1, 0, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 3, 1, 2, 0>,
- ConvImpl::template process_tile<1, 0, 3, 1, 2, 1>,
- ConvImpl::template process_tile<1, 0, 3, 1, 2, 2>,
- ConvImpl::template process_tile<1, 0, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 3, 1, 3, 0>,
- ConvImpl::template process_tile<1, 0, 3, 1, 3, 1>,
- ConvImpl::template process_tile<1, 0, 3, 1, 3, 2>,
- ConvImpl::template process_tile<1, 0, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 3, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 3, 2, 0, 1>,
- ConvImpl::template process_tile<1, 0, 3, 2, 0, 2>,
- ConvImpl::template process_tile<1, 0, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 3, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 3, 2, 1, 1>,
- ConvImpl::template process_tile<1, 0, 3, 2, 1, 2>,
- ConvImpl::template process_tile<1, 0, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 3, 2, 2, 0>,
- ConvImpl::template process_tile<1, 0, 3, 2, 2, 1>,
- ConvImpl::template process_tile<1, 0, 3, 2, 2, 2>,
- ConvImpl::template process_tile<1, 0, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 3, 2, 3, 0>,
- ConvImpl::template process_tile<1, 0, 3, 2, 3, 1>,
- ConvImpl::template process_tile<1, 0, 3, 2, 3, 2>,
- ConvImpl::template process_tile<1, 0, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 3, 3, 0, 0>,
- ConvImpl::template process_tile<1, 0, 3, 3, 0, 1>,
- ConvImpl::template process_tile<1, 0, 3, 3, 0, 2>,
- ConvImpl::template process_tile<1, 0, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 3, 3, 1, 0>,
- ConvImpl::template process_tile<1, 0, 3, 3, 1, 1>,
- ConvImpl::template process_tile<1, 0, 3, 3, 1, 2>,
- ConvImpl::template process_tile<1, 0, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 3, 3, 2, 0>,
- ConvImpl::template process_tile<1, 0, 3, 3, 2, 1>,
- ConvImpl::template process_tile<1, 0, 3, 3, 2, 2>,
- ConvImpl::template process_tile<1, 0, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 3, 3, 3, 0>,
- ConvImpl::template process_tile<1, 0, 3, 3, 3, 1>,
- ConvImpl::template process_tile<1, 0, 3, 3, 3, 2>,
- ConvImpl::template process_tile<1, 0, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 3, 4, 0, 0>,
- ConvImpl::template process_tile<1, 0, 3, 4, 0, 1>,
- ConvImpl::template process_tile<1, 0, 3, 4, 0, 2>,
- ConvImpl::template process_tile<1, 0, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 3, 4, 1, 0>,
- ConvImpl::template process_tile<1, 0, 3, 4, 1, 1>,
- ConvImpl::template process_tile<1, 0, 3, 4, 1, 2>,
- ConvImpl::template process_tile<1, 0, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 3, 4, 2, 0>,
- ConvImpl::template process_tile<1, 0, 3, 4, 2, 1>,
- ConvImpl::template process_tile<1, 0, 3, 4, 2, 2>,
- ConvImpl::template process_tile<1, 0, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 3, 4, 3, 0>,
- ConvImpl::template process_tile<1, 0, 3, 4, 3, 1>,
- ConvImpl::template process_tile<1, 0, 3, 4, 3, 2>,
- ConvImpl::template process_tile<1, 0, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<1, 0, 4, 0, 0, 1>,
- ConvImpl::template process_tile<1, 0, 4, 0, 0, 2>,
- ConvImpl::template process_tile<1, 0, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<1, 0, 4, 0, 1, 1>,
- ConvImpl::template process_tile<1, 0, 4, 0, 1, 2>,
- ConvImpl::template process_tile<1, 0, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<1, 0, 4, 0, 2, 1>,
- ConvImpl::template process_tile<1, 0, 4, 0, 2, 2>,
- ConvImpl::template process_tile<1, 0, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 4, 0, 3, 0>,
- ConvImpl::template process_tile<1, 0, 4, 0, 3, 1>,
- ConvImpl::template process_tile<1, 0, 4, 0, 3, 2>,
- ConvImpl::template process_tile<1, 0, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 4, 1, 0, 0>,
- ConvImpl::template process_tile<1, 0, 4, 1, 0, 1>,
- ConvImpl::template process_tile<1, 0, 4, 1, 0, 2>,
- ConvImpl::template process_tile<1, 0, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 4, 1, 1, 0>,
- ConvImpl::template process_tile<1, 0, 4, 1, 1, 1>,
- ConvImpl::template process_tile<1, 0, 4, 1, 1, 2>,
- ConvImpl::template process_tile<1, 0, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 4, 1, 2, 0>,
- ConvImpl::template process_tile<1, 0, 4, 1, 2, 1>,
- ConvImpl::template process_tile<1, 0, 4, 1, 2, 2>,
- ConvImpl::template process_tile<1, 0, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 4, 1, 3, 0>,
- ConvImpl::template process_tile<1, 0, 4, 1, 3, 1>,
- ConvImpl::template process_tile<1, 0, 4, 1, 3, 2>,
- ConvImpl::template process_tile<1, 0, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 4, 2, 0, 0>,
- ConvImpl::template process_tile<1, 0, 4, 2, 0, 1>,
- ConvImpl::template process_tile<1, 0, 4, 2, 0, 2>,
- ConvImpl::template process_tile<1, 0, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 4, 2, 1, 0>,
- ConvImpl::template process_tile<1, 0, 4, 2, 1, 1>,
- ConvImpl::template process_tile<1, 0, 4, 2, 1, 2>,
- ConvImpl::template process_tile<1, 0, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 4, 2, 2, 0>,
- ConvImpl::template process_tile<1, 0, 4, 2, 2, 1>,
- ConvImpl::template process_tile<1, 0, 4, 2, 2, 2>,
- ConvImpl::template process_tile<1, 0, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 4, 2, 3, 0>,
- ConvImpl::template process_tile<1, 0, 4, 2, 3, 1>,
- ConvImpl::template process_tile<1, 0, 4, 2, 3, 2>,
- ConvImpl::template process_tile<1, 0, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 4, 3, 0, 0>,
- ConvImpl::template process_tile<1, 0, 4, 3, 0, 1>,
- ConvImpl::template process_tile<1, 0, 4, 3, 0, 2>,
- ConvImpl::template process_tile<1, 0, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 4, 3, 1, 0>,
- ConvImpl::template process_tile<1, 0, 4, 3, 1, 1>,
- ConvImpl::template process_tile<1, 0, 4, 3, 1, 2>,
- ConvImpl::template process_tile<1, 0, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 4, 3, 2, 0>,
- ConvImpl::template process_tile<1, 0, 4, 3, 2, 1>,
- ConvImpl::template process_tile<1, 0, 4, 3, 2, 2>,
- ConvImpl::template process_tile<1, 0, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 4, 3, 3, 0>,
- ConvImpl::template process_tile<1, 0, 4, 3, 3, 1>,
- ConvImpl::template process_tile<1, 0, 4, 3, 3, 2>,
- ConvImpl::template process_tile<1, 0, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 0, 4, 4, 0, 0>,
- ConvImpl::template process_tile<1, 0, 4, 4, 0, 1>,
- ConvImpl::template process_tile<1, 0, 4, 4, 0, 2>,
- ConvImpl::template process_tile<1, 0, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 0, 4, 4, 1, 0>,
- ConvImpl::template process_tile<1, 0, 4, 4, 1, 1>,
- ConvImpl::template process_tile<1, 0, 4, 4, 1, 2>,
- ConvImpl::template process_tile<1, 0, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 0, 4, 4, 2, 0>,
- ConvImpl::template process_tile<1, 0, 4, 4, 2, 1>,
- ConvImpl::template process_tile<1, 0, 4, 4, 2, 2>,
- ConvImpl::template process_tile<1, 0, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 0, 4, 4, 3, 0>,
- ConvImpl::template process_tile<1, 0, 4, 4, 3, 1>,
- ConvImpl::template process_tile<1, 0, 4, 4, 3, 2>,
- ConvImpl::template process_tile<1, 0, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>,
- ConvImpl::template process_tile<1, 1, 0, 0, 0, 2>,
- ConvImpl::template process_tile<1, 1, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>,
- ConvImpl::template process_tile<1, 1, 0, 0, 1, 2>,
- ConvImpl::template process_tile<1, 1, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 0, 0, 2, 0>,
- ConvImpl::template process_tile<1, 1, 0, 0, 2, 1>,
- ConvImpl::template process_tile<1, 1, 0, 0, 2, 2>,
- ConvImpl::template process_tile<1, 1, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 0, 0, 3, 0>,
- ConvImpl::template process_tile<1, 1, 0, 0, 3, 1>,
- ConvImpl::template process_tile<1, 1, 0, 0, 3, 2>,
- ConvImpl::template process_tile<1, 1, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>,
- ConvImpl::template process_tile<1, 1, 0, 1, 0, 2>,
- ConvImpl::template process_tile<1, 1, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>,
- ConvImpl::template process_tile<1, 1, 0, 1, 1, 2>,
- ConvImpl::template process_tile<1, 1, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 0, 1, 2, 0>,
- ConvImpl::template process_tile<1, 1, 0, 1, 2, 1>,
- ConvImpl::template process_tile<1, 1, 0, 1, 2, 2>,
- ConvImpl::template process_tile<1, 1, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 0, 1, 3, 0>,
- ConvImpl::template process_tile<1, 1, 0, 1, 3, 1>,
- ConvImpl::template process_tile<1, 1, 0, 1, 3, 2>,
- ConvImpl::template process_tile<1, 1, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>,
- ConvImpl::template process_tile<1, 1, 0, 2, 0, 2>,
- ConvImpl::template process_tile<1, 1, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>,
- ConvImpl::template process_tile<1, 1, 0, 2, 1, 2>,
- ConvImpl::template process_tile<1, 1, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 0, 2, 2, 0>,
- ConvImpl::template process_tile<1, 1, 0, 2, 2, 1>,
- ConvImpl::template process_tile<1, 1, 0, 2, 2, 2>,
- ConvImpl::template process_tile<1, 1, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 0, 2, 3, 0>,
- ConvImpl::template process_tile<1, 1, 0, 2, 3, 1>,
- ConvImpl::template process_tile<1, 1, 0, 2, 3, 2>,
- ConvImpl::template process_tile<1, 1, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 3, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 3, 0, 1>,
- ConvImpl::template process_tile<1, 1, 0, 3, 0, 2>,
- ConvImpl::template process_tile<1, 1, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 3, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 3, 1, 1>,
- ConvImpl::template process_tile<1, 1, 0, 3, 1, 2>,
- ConvImpl::template process_tile<1, 1, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 0, 3, 2, 0>,
- ConvImpl::template process_tile<1, 1, 0, 3, 2, 1>,
- ConvImpl::template process_tile<1, 1, 0, 3, 2, 2>,
- ConvImpl::template process_tile<1, 1, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 0, 3, 3, 0>,
- ConvImpl::template process_tile<1, 1, 0, 3, 3, 1>,
- ConvImpl::template process_tile<1, 1, 0, 3, 3, 2>,
- ConvImpl::template process_tile<1, 1, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 0, 4, 0, 0>,
- ConvImpl::template process_tile<1, 1, 0, 4, 0, 1>,
- ConvImpl::template process_tile<1, 1, 0, 4, 0, 2>,
- ConvImpl::template process_tile<1, 1, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 0, 4, 1, 0>,
- ConvImpl::template process_tile<1, 1, 0, 4, 1, 1>,
- ConvImpl::template process_tile<1, 1, 0, 4, 1, 2>,
- ConvImpl::template process_tile<1, 1, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 0, 4, 2, 0>,
- ConvImpl::template process_tile<1, 1, 0, 4, 2, 1>,
- ConvImpl::template process_tile<1, 1, 0, 4, 2, 2>,
- ConvImpl::template process_tile<1, 1, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 0, 4, 3, 0>,
- ConvImpl::template process_tile<1, 1, 0, 4, 3, 1>,
- ConvImpl::template process_tile<1, 1, 0, 4, 3, 2>,
- ConvImpl::template process_tile<1, 1, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>,
- ConvImpl::template process_tile<1, 1, 1, 0, 0, 2>,
- ConvImpl::template process_tile<1, 1, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>,
- ConvImpl::template process_tile<1, 1, 1, 0, 1, 2>,
- ConvImpl::template process_tile<1, 1, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 1, 0, 2, 0>,
- ConvImpl::template process_tile<1, 1, 1, 0, 2, 1>,
- ConvImpl::template process_tile<1, 1, 1, 0, 2, 2>,
- ConvImpl::template process_tile<1, 1, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 1, 0, 3, 0>,
- ConvImpl::template process_tile<1, 1, 1, 0, 3, 1>,
- ConvImpl::template process_tile<1, 1, 1, 0, 3, 2>,
- ConvImpl::template process_tile<1, 1, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>,
- ConvImpl::template process_tile<1, 1, 1, 1, 0, 2>,
- ConvImpl::template process_tile<1, 1, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>,
- ConvImpl::template process_tile<1, 1, 1, 1, 1, 2>,
- ConvImpl::template process_tile<1, 1, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 1, 1, 2, 0>,
- ConvImpl::template process_tile<1, 1, 1, 1, 2, 1>,
- ConvImpl::template process_tile<1, 1, 1, 1, 2, 2>,
- ConvImpl::template process_tile<1, 1, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 1, 1, 3, 0>,
- ConvImpl::template process_tile<1, 1, 1, 1, 3, 1>,
- ConvImpl::template process_tile<1, 1, 1, 1, 3, 2>,
- ConvImpl::template process_tile<1, 1, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>,
- ConvImpl::template process_tile<1, 1, 1, 2, 0, 2>,
- ConvImpl::template process_tile<1, 1, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>,
- ConvImpl::template process_tile<1, 1, 1, 2, 1, 2>,
- ConvImpl::template process_tile<1, 1, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 1, 2, 2, 0>,
- ConvImpl::template process_tile<1, 1, 1, 2, 2, 1>,
- ConvImpl::template process_tile<1, 1, 1, 2, 2, 2>,
- ConvImpl::template process_tile<1, 1, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 1, 2, 3, 0>,
- ConvImpl::template process_tile<1, 1, 1, 2, 3, 1>,
- ConvImpl::template process_tile<1, 1, 1, 2, 3, 2>,
- ConvImpl::template process_tile<1, 1, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 3, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 3, 0, 1>,
- ConvImpl::template process_tile<1, 1, 1, 3, 0, 2>,
- ConvImpl::template process_tile<1, 1, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 3, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 3, 1, 1>,
- ConvImpl::template process_tile<1, 1, 1, 3, 1, 2>,
- ConvImpl::template process_tile<1, 1, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 1, 3, 2, 0>,
- ConvImpl::template process_tile<1, 1, 1, 3, 2, 1>,
- ConvImpl::template process_tile<1, 1, 1, 3, 2, 2>,
- ConvImpl::template process_tile<1, 1, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 1, 3, 3, 0>,
- ConvImpl::template process_tile<1, 1, 1, 3, 3, 1>,
- ConvImpl::template process_tile<1, 1, 1, 3, 3, 2>,
- ConvImpl::template process_tile<1, 1, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 1, 4, 0, 0>,
- ConvImpl::template process_tile<1, 1, 1, 4, 0, 1>,
- ConvImpl::template process_tile<1, 1, 1, 4, 0, 2>,
- ConvImpl::template process_tile<1, 1, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 1, 4, 1, 0>,
- ConvImpl::template process_tile<1, 1, 1, 4, 1, 1>,
- ConvImpl::template process_tile<1, 1, 1, 4, 1, 2>,
- ConvImpl::template process_tile<1, 1, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 1, 4, 2, 0>,
- ConvImpl::template process_tile<1, 1, 1, 4, 2, 1>,
- ConvImpl::template process_tile<1, 1, 1, 4, 2, 2>,
- ConvImpl::template process_tile<1, 1, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 1, 4, 3, 0>,
- ConvImpl::template process_tile<1, 1, 1, 4, 3, 1>,
- ConvImpl::template process_tile<1, 1, 1, 4, 3, 2>,
- ConvImpl::template process_tile<1, 1, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>,
- ConvImpl::template process_tile<1, 1, 2, 0, 0, 2>,
- ConvImpl::template process_tile<1, 1, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>,
- ConvImpl::template process_tile<1, 1, 2, 0, 1, 2>,
- ConvImpl::template process_tile<1, 1, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 2, 0, 2, 0>,
- ConvImpl::template process_tile<1, 1, 2, 0, 2, 1>,
- ConvImpl::template process_tile<1, 1, 2, 0, 2, 2>,
- ConvImpl::template process_tile<1, 1, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 2, 0, 3, 0>,
- ConvImpl::template process_tile<1, 1, 2, 0, 3, 1>,
- ConvImpl::template process_tile<1, 1, 2, 0, 3, 2>,
- ConvImpl::template process_tile<1, 1, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>,
- ConvImpl::template process_tile<1, 1, 2, 1, 0, 2>,
- ConvImpl::template process_tile<1, 1, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>,
- ConvImpl::template process_tile<1, 1, 2, 1, 1, 2>,
- ConvImpl::template process_tile<1, 1, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 2, 1, 2, 0>,
- ConvImpl::template process_tile<1, 1, 2, 1, 2, 1>,
- ConvImpl::template process_tile<1, 1, 2, 1, 2, 2>,
- ConvImpl::template process_tile<1, 1, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 2, 1, 3, 0>,
- ConvImpl::template process_tile<1, 1, 2, 1, 3, 1>,
- ConvImpl::template process_tile<1, 1, 2, 1, 3, 2>,
- ConvImpl::template process_tile<1, 1, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>,
- ConvImpl::template process_tile<1, 1, 2, 2, 0, 2>,
- ConvImpl::template process_tile<1, 1, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>,
- ConvImpl::template process_tile<1, 1, 2, 2, 1, 2>,
- ConvImpl::template process_tile<1, 1, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 2, 2, 2, 0>,
- ConvImpl::template process_tile<1, 1, 2, 2, 2, 1>,
- ConvImpl::template process_tile<1, 1, 2, 2, 2, 2>,
- ConvImpl::template process_tile<1, 1, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 2, 2, 3, 0>,
- ConvImpl::template process_tile<1, 1, 2, 2, 3, 1>,
- ConvImpl::template process_tile<1, 1, 2, 2, 3, 2>,
- ConvImpl::template process_tile<1, 1, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 3, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 3, 0, 1>,
- ConvImpl::template process_tile<1, 1, 2, 3, 0, 2>,
- ConvImpl::template process_tile<1, 1, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 3, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 3, 1, 1>,
- ConvImpl::template process_tile<1, 1, 2, 3, 1, 2>,
- ConvImpl::template process_tile<1, 1, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 2, 3, 2, 0>,
- ConvImpl::template process_tile<1, 1, 2, 3, 2, 1>,
- ConvImpl::template process_tile<1, 1, 2, 3, 2, 2>,
- ConvImpl::template process_tile<1, 1, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 2, 3, 3, 0>,
- ConvImpl::template process_tile<1, 1, 2, 3, 3, 1>,
- ConvImpl::template process_tile<1, 1, 2, 3, 3, 2>,
- ConvImpl::template process_tile<1, 1, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 2, 4, 0, 0>,
- ConvImpl::template process_tile<1, 1, 2, 4, 0, 1>,
- ConvImpl::template process_tile<1, 1, 2, 4, 0, 2>,
- ConvImpl::template process_tile<1, 1, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 2, 4, 1, 0>,
- ConvImpl::template process_tile<1, 1, 2, 4, 1, 1>,
- ConvImpl::template process_tile<1, 1, 2, 4, 1, 2>,
- ConvImpl::template process_tile<1, 1, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 2, 4, 2, 0>,
- ConvImpl::template process_tile<1, 1, 2, 4, 2, 1>,
- ConvImpl::template process_tile<1, 1, 2, 4, 2, 2>,
- ConvImpl::template process_tile<1, 1, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 2, 4, 3, 0>,
- ConvImpl::template process_tile<1, 1, 2, 4, 3, 1>,
- ConvImpl::template process_tile<1, 1, 2, 4, 3, 2>,
- ConvImpl::template process_tile<1, 1, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 3, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 3, 0, 0, 1>,
- ConvImpl::template process_tile<1, 1, 3, 0, 0, 2>,
- ConvImpl::template process_tile<1, 1, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 3, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 3, 0, 1, 1>,
- ConvImpl::template process_tile<1, 1, 3, 0, 1, 2>,
- ConvImpl::template process_tile<1, 1, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 3, 0, 2, 0>,
- ConvImpl::template process_tile<1, 1, 3, 0, 2, 1>,
- ConvImpl::template process_tile<1, 1, 3, 0, 2, 2>,
- ConvImpl::template process_tile<1, 1, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 3, 0, 3, 0>,
- ConvImpl::template process_tile<1, 1, 3, 0, 3, 1>,
- ConvImpl::template process_tile<1, 1, 3, 0, 3, 2>,
- ConvImpl::template process_tile<1, 1, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 3, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 3, 1, 0, 1>,
- ConvImpl::template process_tile<1, 1, 3, 1, 0, 2>,
- ConvImpl::template process_tile<1, 1, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 3, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 3, 1, 1, 1>,
- ConvImpl::template process_tile<1, 1, 3, 1, 1, 2>,
- ConvImpl::template process_tile<1, 1, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 3, 1, 2, 0>,
- ConvImpl::template process_tile<1, 1, 3, 1, 2, 1>,
- ConvImpl::template process_tile<1, 1, 3, 1, 2, 2>,
- ConvImpl::template process_tile<1, 1, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 3, 1, 3, 0>,
- ConvImpl::template process_tile<1, 1, 3, 1, 3, 1>,
- ConvImpl::template process_tile<1, 1, 3, 1, 3, 2>,
- ConvImpl::template process_tile<1, 1, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 3, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 3, 2, 0, 1>,
- ConvImpl::template process_tile<1, 1, 3, 2, 0, 2>,
- ConvImpl::template process_tile<1, 1, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 3, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 3, 2, 1, 1>,
- ConvImpl::template process_tile<1, 1, 3, 2, 1, 2>,
- ConvImpl::template process_tile<1, 1, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 3, 2, 2, 0>,
- ConvImpl::template process_tile<1, 1, 3, 2, 2, 1>,
- ConvImpl::template process_tile<1, 1, 3, 2, 2, 2>,
- ConvImpl::template process_tile<1, 1, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 3, 2, 3, 0>,
- ConvImpl::template process_tile<1, 1, 3, 2, 3, 1>,
- ConvImpl::template process_tile<1, 1, 3, 2, 3, 2>,
- ConvImpl::template process_tile<1, 1, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 3, 3, 0, 0>,
- ConvImpl::template process_tile<1, 1, 3, 3, 0, 1>,
- ConvImpl::template process_tile<1, 1, 3, 3, 0, 2>,
- ConvImpl::template process_tile<1, 1, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 3, 3, 1, 0>,
- ConvImpl::template process_tile<1, 1, 3, 3, 1, 1>,
- ConvImpl::template process_tile<1, 1, 3, 3, 1, 2>,
- ConvImpl::template process_tile<1, 1, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 3, 3, 2, 0>,
- ConvImpl::template process_tile<1, 1, 3, 3, 2, 1>,
- ConvImpl::template process_tile<1, 1, 3, 3, 2, 2>,
- ConvImpl::template process_tile<1, 1, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 3, 3, 3, 0>,
- ConvImpl::template process_tile<1, 1, 3, 3, 3, 1>,
- ConvImpl::template process_tile<1, 1, 3, 3, 3, 2>,
- ConvImpl::template process_tile<1, 1, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 3, 4, 0, 0>,
- ConvImpl::template process_tile<1, 1, 3, 4, 0, 1>,
- ConvImpl::template process_tile<1, 1, 3, 4, 0, 2>,
- ConvImpl::template process_tile<1, 1, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 3, 4, 1, 0>,
- ConvImpl::template process_tile<1, 1, 3, 4, 1, 1>,
- ConvImpl::template process_tile<1, 1, 3, 4, 1, 2>,
- ConvImpl::template process_tile<1, 1, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 3, 4, 2, 0>,
- ConvImpl::template process_tile<1, 1, 3, 4, 2, 1>,
- ConvImpl::template process_tile<1, 1, 3, 4, 2, 2>,
- ConvImpl::template process_tile<1, 1, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 3, 4, 3, 0>,
- ConvImpl::template process_tile<1, 1, 3, 4, 3, 1>,
- ConvImpl::template process_tile<1, 1, 3, 4, 3, 2>,
- ConvImpl::template process_tile<1, 1, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 4, 0, 0, 0>,
- ConvImpl::template process_tile<1, 1, 4, 0, 0, 1>,
- ConvImpl::template process_tile<1, 1, 4, 0, 0, 2>,
- ConvImpl::template process_tile<1, 1, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 4, 0, 1, 0>,
- ConvImpl::template process_tile<1, 1, 4, 0, 1, 1>,
- ConvImpl::template process_tile<1, 1, 4, 0, 1, 2>,
- ConvImpl::template process_tile<1, 1, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 4, 0, 2, 0>,
- ConvImpl::template process_tile<1, 1, 4, 0, 2, 1>,
- ConvImpl::template process_tile<1, 1, 4, 0, 2, 2>,
- ConvImpl::template process_tile<1, 1, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 4, 0, 3, 0>,
- ConvImpl::template process_tile<1, 1, 4, 0, 3, 1>,
- ConvImpl::template process_tile<1, 1, 4, 0, 3, 2>,
- ConvImpl::template process_tile<1, 1, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 4, 1, 0, 0>,
- ConvImpl::template process_tile<1, 1, 4, 1, 0, 1>,
- ConvImpl::template process_tile<1, 1, 4, 1, 0, 2>,
- ConvImpl::template process_tile<1, 1, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 4, 1, 1, 0>,
- ConvImpl::template process_tile<1, 1, 4, 1, 1, 1>,
- ConvImpl::template process_tile<1, 1, 4, 1, 1, 2>,
- ConvImpl::template process_tile<1, 1, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 4, 1, 2, 0>,
- ConvImpl::template process_tile<1, 1, 4, 1, 2, 1>,
- ConvImpl::template process_tile<1, 1, 4, 1, 2, 2>,
- ConvImpl::template process_tile<1, 1, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 4, 1, 3, 0>,
- ConvImpl::template process_tile<1, 1, 4, 1, 3, 1>,
- ConvImpl::template process_tile<1, 1, 4, 1, 3, 2>,
- ConvImpl::template process_tile<1, 1, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 4, 2, 0, 0>,
- ConvImpl::template process_tile<1, 1, 4, 2, 0, 1>,
- ConvImpl::template process_tile<1, 1, 4, 2, 0, 2>,
- ConvImpl::template process_tile<1, 1, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 4, 2, 1, 0>,
- ConvImpl::template process_tile<1, 1, 4, 2, 1, 1>,
- ConvImpl::template process_tile<1, 1, 4, 2, 1, 2>,
- ConvImpl::template process_tile<1, 1, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 4, 2, 2, 0>,
- ConvImpl::template process_tile<1, 1, 4, 2, 2, 1>,
- ConvImpl::template process_tile<1, 1, 4, 2, 2, 2>,
- ConvImpl::template process_tile<1, 1, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 4, 2, 3, 0>,
- ConvImpl::template process_tile<1, 1, 4, 2, 3, 1>,
- ConvImpl::template process_tile<1, 1, 4, 2, 3, 2>,
- ConvImpl::template process_tile<1, 1, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 4, 3, 0, 0>,
- ConvImpl::template process_tile<1, 1, 4, 3, 0, 1>,
- ConvImpl::template process_tile<1, 1, 4, 3, 0, 2>,
- ConvImpl::template process_tile<1, 1, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 4, 3, 1, 0>,
- ConvImpl::template process_tile<1, 1, 4, 3, 1, 1>,
- ConvImpl::template process_tile<1, 1, 4, 3, 1, 2>,
- ConvImpl::template process_tile<1, 1, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 4, 3, 2, 0>,
- ConvImpl::template process_tile<1, 1, 4, 3, 2, 1>,
- ConvImpl::template process_tile<1, 1, 4, 3, 2, 2>,
- ConvImpl::template process_tile<1, 1, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 4, 3, 3, 0>,
- ConvImpl::template process_tile<1, 1, 4, 3, 3, 1>,
- ConvImpl::template process_tile<1, 1, 4, 3, 3, 2>,
- ConvImpl::template process_tile<1, 1, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- ConvImpl::template process_tile<1, 1, 4, 4, 0, 0>,
- ConvImpl::template process_tile<1, 1, 4, 4, 0, 1>,
- ConvImpl::template process_tile<1, 1, 4, 4, 0, 2>,
- ConvImpl::template process_tile<1, 1, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- ConvImpl::template process_tile<1, 1, 4, 4, 1, 0>,
- ConvImpl::template process_tile<1, 1, 4, 4, 1, 1>,
- ConvImpl::template process_tile<1, 1, 4, 4, 1, 2>,
- ConvImpl::template process_tile<1, 1, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- ConvImpl::template process_tile<1, 1, 4, 4, 2, 0>,
- ConvImpl::template process_tile<1, 1, 4, 4, 2, 1>,
- ConvImpl::template process_tile<1, 1, 4, 4, 2, 2>,
- ConvImpl::template process_tile<1, 1, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- ConvImpl::template process_tile<1, 1, 4, 4, 3, 0>,
- ConvImpl::template process_tile<1, 1, 4, 4, 3, 1>,
- ConvImpl::template process_tile<1, 1, 4, 4, 3, 2>,
- ConvImpl::template process_tile<1, 1, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- }, // Input pad bottom = 4
- }, // Input pad left = 1
- }, // Input pad top = 1
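+// Specialisation of process_tile for the fully in-bounds case: all six
+// padding template parameters are zero, so no edge handling is needed and
+// the body can be written as hand-scheduled AArch64 NEON assembly.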
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+ const int n_channels,
+ const float* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int, const int, const int, const int, const int, const int  // unnamed padding arguments, unused in this fully in-bounds specialisation
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Padding parameters: all zero, since this specialisation handles only the fully in-bounds tile
+ const int in_pad_top = 0;
+ const int in_pad_left = 0;
+ const int in_pad_bottom = 0;
+ const int in_pad_right = 0;
+ const int out_pad_bottom = 0;
+ const int out_pad_right = 0;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
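+ // With zero padding these are simply the full tile extents; the register
+ // names in the assembly below correspond to a 6x6 input tile (U11..U66),
+ // a 3x3 kernel (W11..W33) and a 4x4 output tile (V11..V44).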
+
+ // Copy pointers
+ const float *uptr0 = inptr;
+ const float *wptr0 = weights;
+ float *vptr0 = outptr;
+ const bool same_strides = (
+ weight_col_stride == in_col_stride &&
+ weight_col_stride == out_col_stride
+ );
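+ // The assembly binds one set of column-stride operands (uvw_col_stride*)
+ // for inputs, weights and outputs alike, so it can only be used when all
+ // three column strides agree.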
+
+ int channels_remaining = n_channels;
+ if (channels_remaining >= 4 && same_strides)
+ {
+ int c4_rem = channels_remaining / 4;
+ channels_remaining %= 4;
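+ // The assembly consumes four float channels per iteration (one 128-bit q
+ // register); channels_remaining keeps the tail count for the remainder
+ // handling that follows this block.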
+ const int prefetch_depth = 8;
+
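+ // Naming scheme in the assembly: U<row><col> are input tile cells,
+ // W<row><col> kernel weights and V<row><col> output accumulators. The
+ // .req directives alias these logical names onto the 32 NEON registers,
+ // reusing each physical register for several values with disjoint live
+ // ranges.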
+ asm volatile (
+ "qW22 .req q0\n" "vW22 .req v0\n"
+ "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
+ "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
+ "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
+ "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
+ "qW21 .req q3\n" "vW21 .req v3\n"
+ "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
+ "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
+ "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
+ "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
+ "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
+ "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
+ "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
+ "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
+ "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
+ "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
+ "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
+ "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
+ "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
+ "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
+ "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
+ "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
+ "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
+ "qW33 .req q16\n" "vW33 .req v16\n"
+ "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
+ "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
+ "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
+ "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
+ "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
+ "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
+ "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
+ "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
+ "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
+ "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
+ "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
+ "qU23 .req q28\n" "qU52 .req q28\n"
+ "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
+
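+ // x-register aliases for the row-base pointers: six input rows (uptr),
+ // four output rows (vptr) and three weight rows (wptr).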
+ "uptr1 .req x0\n"
+ "uptr2 .req x1\n"
+ "uptr3 .req x2\n"
+ "uptr4 .req x3\n"
+ "uptr5 .req x4\n"
+
+ "vptr1 .req x5\n"
+ "vptr2 .req x6\n"
+ "vptr3 .req x7\n"
+
+ "wptr1 .req x8\n"
+ "wptr2 .req x9\n"
+
+ // Prepare pointers and strides
+ "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+ "add uptr2, uptr1 , %x[u_row_stride]\n"
+ "add uptr3, uptr2 , %x[u_row_stride]\n"
+ "add uptr4, uptr3 , %x[u_row_stride]\n"
+ "add uptr5, uptr4 , %x[u_row_stride]\n"
+
+ "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+ "add vptr2, vptr1 , %x[v_row_stride]\n"
+ "add vptr3, vptr2 , %x[v_row_stride]\n"
+
+ "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+ "add wptr2, wptr1 , %x[w_row_stride]\n"
+
+ // Load initial operands
+ "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
+ "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
+ "subs %x[c4_rem], %x[c4_rem], #1\n"
+ "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
+ "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
+ "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
+ "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
+ "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
+ "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
+ "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
+ "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
+ "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
+ "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
+ "ldr qW11, [%x[wptr0]], #0x10\n"
+ "fmul vV14.4s, vU16.4s, vW13.4s\n"
+ "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
+ "fmul vV13.4s, vU15.4s, vW13.4s\n"
+ "ldr qW31, [wptr2], #0x10\n"
+ "fmla vV14.4s, vU15.4s, vW12.4s\n"
+ "ldr qW21, [wptr1], #0x10\n"
+ "fmul vV12.4s, vU14.4s, vW13.4s\n"
+ "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
+ "fmla vV13.4s, vU14.4s, vW12.4s\n"
+ "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
+ "fmla vV14.4s, vU14.4s, vW11.4s\n"
+ "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
+ "fmla vV14.4s, vU26.4s, vW23.4s\n"
+ "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
+ "fmul vV24.4s, vU26.4s, vW13.4s\n"
+ "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
+ "fmla vV13.4s, vU25.4s, vW23.4s\n"
+ "beq 2f\n" // Single iteration only
+
+ "1:" // Loop body
+ "fmla vV14.4s, vU25.4s, vW22.4s\n"
+ "prfm pldl1keep, [%x[wptr0], %[prftch]]\n"
+ "fmul vV23.4s, vU25.4s, vW13.4s\n"
+ "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV24.4s, vU25.4s, vW12.4s\n"
+ "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
+ "fmla vV12.4s, vU24.4s, vW23.4s\n"
+ "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV13.4s, vU24.4s, vW22.4s\n"
+ "prfm pldl1keep, [ wptr1 , %[prftch]]\n"
+ "fmla vV14.4s, vU24.4s, vW21.4s\n"
+ "prfm pldl1keep, [ wptr1 , %x[prftch_uvw_col_stride1]]\n"
+ "fmul vV22.4s, vU24.4s, vW13.4s\n"
+ "prfm pldl1keep, [ wptr1 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV23.4s, vU24.4s, vW12.4s\n"
+ "prfm pldl1keep, [ wptr2 , %x[prftch]]\n"
+ "fmla vV24.4s, vU24.4s, vW11.4s\n"
+ "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
+ "fmla vV14.4s, vU36.4s, vW33.4s\n"
+ "prfm pldl1keep, [ wptr2 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV24.4s, vU36.4s, vW23.4s\n"
+ "prfm pldl1keep, [ wptr2 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmul vV34.4s, vU36.4s, vW13.4s\n"
+ "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
+ "fmla vV13.4s, vU35.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV14.4s, vU35.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV23.4s, vU35.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV24.4s, vU35.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride4] ]\n"
+ "fmul vV33.4s, vU35.4s, vW13.4s\n"
+ "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride5] ]\n"
+ "fmla vV34.4s, vU35.4s, vW12.4s\n"
+ "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
+ "fmla vV12.4s, vU34.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr3 , %[prftch]]\n"
+ "fmla vV13.4s, vU34.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV14.4s, vU34.4s, vW31.4s\n"
+ "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
+ "fmla vV22.4s, vU34.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV23.4s, vU34.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV24.4s, vU34.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride4] ]\n"
+ "fmul vV32.4s, vU34.4s, vW13.4s\n"
+ "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride5] ]\n"
+ "fmla vV33.4s, vU34.4s, vW12.4s\n"
+ "prfm pldl1keep, [ uptr4 , %[prftch]]\n"
+ "fmla vV34.4s, vU34.4s, vW11.4s\n"
+ "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
+ "fmla vV24.4s, vU46.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV34.4s, vU46.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmul vV44.4s, vU46.4s, vW13.4s\n"
+ "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
+ "fmla vV23.4s, vU45.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV24.4s, vU45.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride4] ]\n"
+ "fmla vV33.4s, vU45.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride5] ]\n"
+ "fmla vV34.4s, vU45.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr5 , %[prftch]]\n"
+ "fmul vV43.4s, vU45.4s, vW13.4s\n"
+ "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV44.4s, vU45.4s, vW12.4s\n"
+ "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
+ "fmla vV22.4s, vU44.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV23.4s, vU44.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV24.4s, vU44.4s, vW31.4s\n"
+ "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
+ "fmla vV32.4s, vU44.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride4] ]\n"
+ "fmla vV33.4s, vU44.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride5] ]\n"
+ "fmla vV34.4s, vU44.4s, vW21.4s\n"
+ "prfm pstl1keep, [%x[vptr0], %[prftch]]\n"
+ "fmul vV42.4s, vU44.4s, vW13.4s\n"
+ "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV43.4s, vU44.4s, vW12.4s\n"
+ "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV44.4s, vU44.4s, vW11.4s\n"
+ "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
+ "fmla vV34.4s, vU56.4s, vW33.4s\n"
+ "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV44.4s, vU56.4s, vW23.4s\n"
+ "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
+ "fmla vV33.4s, vU55.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr1 , %[prftch]]\n"
+ "fmla vV34.4s, vU55.4s, vW32.4s\n"
+ "prfm pstl1keep, [ vptr1 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV43.4s, vU55.4s, vW23.4s\n"
+ "prfm pstl1keep, [ vptr1 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV44.4s, vU55.4s, vW22.4s\n"
+ "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
+ "fmla vV32.4s, vU54.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr1 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV33.4s, vU54.4s, vW32.4s\n"
+ "prfm pstl1keep, [ vptr2 , %[prftch]]\n"
+ "fmla vV34.4s, vU54.4s, vW31.4s\n"
+ "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
+ "fmla vV42.4s, vU54.4s, vW23.4s\n"
+ "prfm pstl1keep, [ vptr2 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV43.4s, vU54.4s, vW22.4s\n"
+ "prfm pstl1keep, [ vptr2 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV44.4s, vU54.4s, vW21.4s\n"
+ "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
+ "fmla vV44.4s, vU66.4s, vW33.4s\n"
+ "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
+ "fmla vV43.4s, vU65.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr2 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV44.4s, vU65.4s, vW32.4s\n"
+ "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
+ "fmla vV42.4s, vU64.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr3 , %[prftch]]\n"
+ "fmla vV43.4s, vU64.4s, vW32.4s\n"
+ "prfm pstl1keep, [ vptr3 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV44.4s, vU64.4s, vW31.4s\n"
+ "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
+ "fmul vV11.4s, vU13.4s, vW13.4s\n"
+ "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
+ "fmla vV12.4s, vU13.4s, vW12.4s\n"
+ "prfm pstl1keep, [ vptr3 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV13.4s, vU13.4s, vW11.4s\n"
+ "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
+ "fmla vV11.4s, vU23.4s, vW23.4s\n"
+ "prfm pstl1keep, [ vptr3 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV12.4s, vU23.4s, vW22.4s\n"
+ "fmla vV13.4s, vU23.4s, vW21.4s\n"
+ "fmul vV21.4s, vU23.4s, vW13.4s\n"
+ "fmla vV22.4s, vU23.4s, vW12.4s\n"
+ "fmla vV23.4s, vU23.4s, vW11.4s\n"
+ "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
+ "fmla vV11.4s, vU33.4s, vW33.4s\n"
+ "fmla vV12.4s, vU33.4s, vW32.4s\n"
+ "fmla vV13.4s, vU33.4s, vW31.4s\n"
+ "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
+ "fmla vV21.4s, vU33.4s, vW23.4s\n"
+ "fmla vV22.4s, vU33.4s, vW22.4s\n"
+ "fmla vV23.4s, vU33.4s, vW21.4s\n"
+ "fmul vV31.4s, vU33.4s, vW13.4s\n"
+ "fmla vV32.4s, vU33.4s, vW12.4s\n"
+ "fmla vV33.4s, vU33.4s, vW11.4s\n"
+ "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
+ "fmla vV21.4s, vU43.4s, vW33.4s\n"
+ "fmla vV22.4s, vU43.4s, vW32.4s\n"
+ "fmla vV23.4s, vU43.4s, vW31.4s\n"
+ "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
+ "fmla vV31.4s, vU43.4s, vW23.4s\n"
+ "fmla vV32.4s, vU43.4s, vW22.4s\n"
+ "fmla vV33.4s, vU43.4s, vW21.4s\n"
+ "fmul vV41.4s, vU43.4s, vW13.4s\n"
+ "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
+ "fmla vV42.4s, vU43.4s, vW12.4s\n"
+ "fmla vV43.4s, vU43.4s, vW11.4s\n"
+ "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
+ "fmla vV31.4s, vU53.4s, vW33.4s\n"
+ "fmla vV32.4s, vU53.4s, vW32.4s\n"
+ "fmla vV33.4s, vU53.4s, vW31.4s\n"
+ "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
+ "fmla vV41.4s, vU53.4s, vW23.4s\n"
+ "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
+ "fmla vV42.4s, vU53.4s, vW22.4s\n"
+ "fmla vV43.4s, vU53.4s, vW21.4s\n"
+ "ldr qU11, [%x[uptr0]], #0x10\n"
+ "fmla vV41.4s, vU63.4s, vW33.4s\n"
+ "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
+ "fmla vV42.4s, vU63.4s, vW32.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %[prftch]]\n"
+ "fmla vV43.4s, vU63.4s, vW31.4s\n"
+ "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
+ "fmla vV11.4s, vU12.4s, vW12.4s\n"
+ "ldr qU21, [uptr1], #0x10\n"
+ "fmla vV12.4s, vU12.4s, vW11.4s\n"
+ "ldr qU31, [uptr2], #0x10\n"
+ "fmla vV11.4s, vU22.4s, vW22.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV12.4s, vU22.4s, vW21.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV21.4s, vU22.4s, vW12.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV22.4s, vU22.4s, vW11.4s\n"
+ "ldr qU41, [uptr3], #0x10\n"
+ "fmla vV11.4s, vU32.4s, vW32.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride4] ]\n"
+ "fmla vV12.4s, vU32.4s, vW31.4s\n"
+ "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
+ "fmla vV21.4s, vU32.4s, vW22.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride5] ]\n"
+ "fmla vV22.4s, vU32.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr1 , %[prftch]]\n"
+ "fmla vV31.4s, vU32.4s, vW12.4s\n"
+ "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride1]]\n"
+ "fmla vV32.4s, vU32.4s, vW11.4s\n"
+ "ldr qU51, [uptr4], #0x10\n"
+ "fmla vV21.4s, vU42.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride2] ]\n"
+ "fmla vV22.4s, vU42.4s, vW31.4s\n"
+ "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
+ "fmla vV31.4s, vU42.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride3] ]\n"
+ "fmla vV32.4s, vU42.4s, vW21.4s\n"
+ "subs %x[c4_rem], %x[c4_rem], #1\n"
+ "fmla vV41.4s, vU42.4s, vW12.4s\n"
+ "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
+ "fmla vV42.4s, vU42.4s, vW11.4s\n"
+ "ldr qU61, [uptr5], #0x10\n"
+ "fmla vV31.4s, vU52.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride4] ]\n"
+ "fmla vV32.4s, vU52.4s, vW31.4s\n"
+ "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
+ "fmla vV41.4s, vU52.4s, vW22.4s\n"
+ "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
+ "fmla vV42.4s, vU52.4s, vW21.4s\n"
+ "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
+ "fmla vV41.4s, vU62.4s, vW32.4s\n"
+ "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
+ "fmla vV42.4s, vU62.4s, vW31.4s\n"
+ "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
+ "fmla vV11.4s, vU11.4s, vW11.4s\n"
+ "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
+ "fmla vV11.4s, vU21.4s, vW21.4s\n"
+ "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
+ "fmla vV21.4s, vU21.4s, vW11.4s\n"
+ "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
+ "fmla vV11.4s, vU31.4s, vW31.4s\n"
+ "str qV11, [%x[vptr0]], #0x10\n"
+ "fmla vV21.4s, vU31.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride5] ]\n"
+ "fmla vV31.4s, vU31.4s, vW11.4s\n"
+ "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
+ "fmla vV21.4s, vU41.4s, vW31.4s\n"
+ "str qV21, [vptr1], #0x10\n"
+ "fmla vV31.4s, vU41.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr2 , %[prftch]]\n"
+ "fmla vV41.4s, vU41.4s, vW11.4s\n"
+ "ldr qW11, [%x[wptr0]], #0x10\n"
+ "fmla vV31.4s, vU51.4s, vW31.4s\n"
+ "str qV31, [vptr2], #0x10\n"
+ "fmla vV41.4s, vU51.4s, vW21.4s\n"
+ "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
+ "fmla vV41.4s, vU61.4s, vW31.4s\n"
+ "str qV41, [vptr3], #0x10\n"
+ "fmul vV14.4s, vU16.4s, vW13.4s\n"
+ "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
+ "fmul vV13.4s, vU15.4s, vW13.4s\n"
+ "ldr qW31, [wptr2], #0x10\n"
+ "fmla vV14.4s, vU15.4s, vW12.4s\n"
+ "ldr qW21, [wptr1], #0x10\n"
+ "fmul vV12.4s, vU14.4s, vW13.4s\n"
+ "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
+ "fmla vV13.4s, vU14.4s, vW12.4s\n"
+ "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
+ "fmla vV14.4s, vU14.4s, vW11.4s\n"
+ "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
+ "fmla vV14.4s, vU26.4s, vW23.4s\n"
+ "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
+ "fmul vV24.4s, vU26.4s, vW13.4s\n"
+ "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
+ "fmla vV13.4s, vU25.4s, vW23.4s\n"
+ "bne 1b\n"
+
+ "2:" // Final iteration
+ "fmla vV14.4s, vU25.4s, vW22.4s\n"
+ "fmul vV23.4s, vU25.4s, vW13.4s\n"
+ "fmla vV24.4s, vU25.4s, vW12.4s\n"
+ "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
+ "fmla vV12.4s, vU24.4s, vW23.4s\n"
+ "fmla vV13.4s, vU24.4s, vW22.4s\n"
+ "fmla vV14.4s, vU24.4s, vW21.4s\n"
+ "fmul vV22.4s, vU24.4s, vW13.4s\n"
+ "fmla vV23.4s, vU24.4s, vW12.4s\n"
+ "fmla vV24.4s, vU24.4s, vW11.4s\n"
+ "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
+ "fmla vV14.4s, vU36.4s, vW33.4s\n"
+ "fmla vV24.4s, vU36.4s, vW23.4s\n"
+ "fmul vV34.4s, vU36.4s, vW13.4s\n"
+ "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
+ "fmla vV13.4s, vU35.4s, vW33.4s\n"
+ "fmla vV14.4s, vU35.4s, vW32.4s\n"
+ "fmla vV23.4s, vU35.4s, vW23.4s\n"
+ "fmla vV24.4s, vU35.4s, vW22.4s\n"
+ "fmul vV33.4s, vU35.4s, vW13.4s\n"
+ "fmla vV34.4s, vU35.4s, vW12.4s\n"
+ "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
+ "fmla vV12.4s, vU34.4s, vW33.4s\n"
+ "fmla vV13.4s, vU34.4s, vW32.4s\n"
+ "fmla vV14.4s, vU34.4s, vW31.4s\n"
+ "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
+ "fmla vV22.4s, vU34.4s, vW23.4s\n"
+ "fmla vV23.4s, vU34.4s, vW22.4s\n"
+ "fmla vV24.4s, vU34.4s, vW21.4s\n"
+ "fmul vV32.4s, vU34.4s, vW13.4s\n"
+ "fmla vV33.4s, vU34.4s, vW12.4s\n"
+ "fmla vV34.4s, vU34.4s, vW11.4s\n"
+ "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
+ "fmla vV24.4s, vU46.4s, vW33.4s\n"
+ "fmla vV34.4s, vU46.4s, vW23.4s\n"
+ "fmul vV44.4s, vU46.4s, vW13.4s\n"
+ "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
+ "fmla vV23.4s, vU45.4s, vW33.4s\n"
+ "fmla vV24.4s, vU45.4s, vW32.4s\n"
+ "fmla vV33.4s, vU45.4s, vW23.4s\n"
+ "fmla vV34.4s, vU45.4s, vW22.4s\n"
+ "fmul vV43.4s, vU45.4s, vW13.4s\n"
+ "fmla vV44.4s, vU45.4s, vW12.4s\n"
+ "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
+ "fmla vV22.4s, vU44.4s, vW33.4s\n"
+ "fmla vV23.4s, vU44.4s, vW32.4s\n"
+ "fmla vV24.4s, vU44.4s, vW31.4s\n"
+ "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
+ "fmla vV32.4s, vU44.4s, vW23.4s\n"
+ "fmla vV33.4s, vU44.4s, vW22.4s\n"
+ "fmla vV34.4s, vU44.4s, vW21.4s\n"
+ "fmul vV42.4s, vU44.4s, vW13.4s\n"
+ "fmla vV43.4s, vU44.4s, vW12.4s\n"
+ "fmla vV44.4s, vU44.4s, vW11.4s\n"
+ "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
+ "fmla vV34.4s, vU56.4s, vW33.4s\n"
+ "fmla vV44.4s, vU56.4s, vW23.4s\n"
+ "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
+ "fmla vV33.4s, vU55.4s, vW33.4s\n"
+ "fmla vV34.4s, vU55.4s, vW32.4s\n"
+ "fmla vV43.4s, vU55.4s, vW23.4s\n"
+ "fmla vV44.4s, vU55.4s, vW22.4s\n"
+ "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
+ "fmla vV32.4s, vU54.4s, vW33.4s\n"
+ "fmla vV33.4s, vU54.4s, vW32.4s\n"
+ "fmla vV34.4s, vU54.4s, vW31.4s\n"
+ "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
+ "fmla vV42.4s, vU54.4s, vW23.4s\n"
+ "fmla vV43.4s, vU54.4s, vW22.4s\n"
+ "fmla vV44.4s, vU54.4s, vW21.4s\n"
+ "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
+ "fmla vV44.4s, vU66.4s, vW33.4s\n"
+ "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
+ "fmla vV43.4s, vU65.4s, vW33.4s\n"
+ "fmla vV44.4s, vU65.4s, vW32.4s\n"
+ "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
+ "fmla vV42.4s, vU64.4s, vW33.4s\n"
+ "fmla vV43.4s, vU64.4s, vW32.4s\n"
+ "fmla vV44.4s, vU64.4s, vW31.4s\n"
+ "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
+ "fmul vV11.4s, vU13.4s, vW13.4s\n"
+ "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
+ "fmla vV12.4s, vU13.4s, vW12.4s\n"
+ "fmla vV13.4s, vU13.4s, vW11.4s\n"
+ "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
+ "fmla vV11.4s, vU23.4s, vW23.4s\n"
+ "fmla vV12.4s, vU23.4s, vW22.4s\n"
+ "fmla vV13.4s, vU23.4s, vW21.4s\n"
+ "fmul vV21.4s, vU23.4s, vW13.4s\n"
+ "fmla vV22.4s, vU23.4s, vW12.4s\n"
+ "fmla vV23.4s, vU23.4s, vW11.4s\n"
+ "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
+ "fmla vV11.4s, vU33.4s, vW33.4s\n"
+ "fmla vV12.4s, vU33.4s, vW32.4s\n"
+ "fmla vV13.4s, vU33.4s, vW31.4s\n"
+ "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
+ "fmla vV21.4s, vU33.4s, vW23.4s\n"
+ "fmla vV22.4s, vU33.4s, vW22.4s\n"
+ "fmla vV23.4s, vU33.4s, vW21.4s\n"
+ "fmul vV31.4s, vU33.4s, vW13.4s\n"
+ "fmla vV32.4s, vU33.4s, vW12.4s\n"
+ "fmla vV33.4s, vU33.4s, vW11.4s\n"
+ "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
+ "fmla vV21.4s, vU43.4s, vW33.4s\n"
+ "fmla vV22.4s, vU43.4s, vW32.4s\n"
+ "fmla vV23.4s, vU43.4s, vW31.4s\n"
+ "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
+ "fmla vV31.4s, vU43.4s, vW23.4s\n"
+ "fmla vV32.4s, vU43.4s, vW22.4s\n"
+ "fmla vV33.4s, vU43.4s, vW21.4s\n"
+ "fmul vV41.4s, vU43.4s, vW13.4s\n"
+ "fmla vV42.4s, vU43.4s, vW12.4s\n"
+ "fmla vV43.4s, vU43.4s, vW11.4s\n"
+ "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
+ "fmla vV31.4s, vU53.4s, vW33.4s\n"
+ "fmla vV32.4s, vU53.4s, vW32.4s\n"
+ "fmla vV33.4s, vU53.4s, vW31.4s\n"
+ "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
+ "fmla vV41.4s, vU53.4s, vW23.4s\n"
+ "fmla vV42.4s, vU53.4s, vW22.4s\n"
+ "fmla vV43.4s, vU53.4s, vW21.4s\n"
+ "ldr qU11, [%x[uptr0]], #0x10\n"
+ "fmla vV41.4s, vU63.4s, vW33.4s\n"
+ "fmla vV42.4s, vU63.4s, vW32.4s\n"
+ "fmla vV43.4s, vU63.4s, vW31.4s\n"
+ "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
+ "fmla vV11.4s, vU12.4s, vW12.4s\n"
+ "ldr qU21, [uptr1], #0x10\n"
+ "fmla vV12.4s, vU12.4s, vW11.4s\n"
+ "ldr qU31, [uptr2], #0x10\n"
+ "fmla vV11.4s, vU22.4s, vW22.4s\n"
+ "fmla vV12.4s, vU22.4s, vW21.4s\n"
+ "fmla vV21.4s, vU22.4s, vW12.4s\n"
+ "fmla vV22.4s, vU22.4s, vW11.4s\n"
+ "ldr qU41, [uptr3], #0x10\n"
+ "fmla vV11.4s, vU32.4s, vW32.4s\n"
+ "fmla vV12.4s, vU32.4s, vW31.4s\n"
+ "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
+ "fmla vV21.4s, vU32.4s, vW22.4s\n"
+ "fmla vV22.4s, vU32.4s, vW21.4s\n"
+ "fmla vV31.4s, vU32.4s, vW12.4s\n"
+ "fmla vV32.4s, vU32.4s, vW11.4s\n"
+ "ldr qU51, [uptr4], #0x10\n"
+ "fmla vV21.4s, vU42.4s, vW32.4s\n"
+ "fmla vV22.4s, vU42.4s, vW31.4s\n"
+ "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
+ "fmla vV31.4s, vU42.4s, vW22.4s\n"
+ "fmla vV32.4s, vU42.4s, vW21.4s\n"
+ "subs %x[c4_rem], %x[c4_rem], #1\n"
+ "fmla vV41.4s, vU42.4s, vW12.4s\n"
+ "fmla vV42.4s, vU42.4s, vW11.4s\n"
+ "ldr qU61, [uptr5], #0x10\n"
+ "fmla vV31.4s, vU52.4s, vW32.4s\n"
+ "fmla vV32.4s, vU52.4s, vW31.4s\n"
+ "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
+ "fmla vV41.4s, vU52.4s, vW22.4s\n"
+ "fmla vV42.4s, vU52.4s, vW21.4s\n"
+ "fmla vV41.4s, vU62.4s, vW32.4s\n"
+ "fmla vV42.4s, vU62.4s, vW31.4s\n"
+ "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
+ "fmla vV11.4s, vU11.4s, vW11.4s\n"
+ "fmla vV11.4s, vU21.4s, vW21.4s\n"
+ "fmla vV21.4s, vU21.4s, vW11.4s\n"
+ "fmla vV11.4s, vU31.4s, vW31.4s\n"
+ "str qV11, [%x[vptr0]], #0x10\n"
+ "fmla vV21.4s, vU31.4s, vW21.4s\n"
+ "fmla vV31.4s, vU31.4s, vW11.4s\n"
+ "fmla vV21.4s, vU41.4s, vW31.4s\n"
+ "str qV21, [vptr1], #0x10\n"
+ "fmla vV31.4s, vU41.4s, vW21.4s\n"
+ "fmla vV41.4s, vU41.4s, vW11.4s\n"
+ "fmla vV31.4s, vU51.4s, vW31.4s\n"
+ "str qV31, [vptr2], #0x10\n"
+ "fmla vV41.4s, vU51.4s, vW21.4s\n"
+ "fmla vV41.4s, vU61.4s, vW31.4s\n"
+ "str qV41, [vptr3], #0x10\n"
+
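+ // Release every symbolic register name so the assembler aliases can be
+ // redefined by later asm blocks in this translation unit.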
+ ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
+ ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
+ ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
+ ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
+ ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
+ ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
+ ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
+ ".unreq qV22\n" ".unreq qU14\n"
+ ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
+ ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
+ ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
+ ".unreq qW33\n"
+ ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
+ ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
+ ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
+ ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
+ ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
+ ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
+ ".unreq qU53\n" ".unreq vW22\n"
+ ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
+ ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
+ ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
+ ".unreq vV12\n" ".unreq vU61\n"
+ ".unreq vU26\n" ".unreq vV32\n"
+ ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
+ ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
+ ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
+ ".unreq vV22\n" ".unreq vU14\n"
+ ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
+ ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
+ ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
+ ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
+ ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
+ ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
+ ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
+ ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
+ ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
+ : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
+ [c4_rem] "+r" (c4_rem)
+ : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+ [v_row_stride] "r" (out_row_stride * sizeof(float)),
+ [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+ [uvw_col_stride1] "r" (1 * in_col_stride * sizeof(float)),
+ [uvw_col_stride2] "r" (2 * in_col_stride * sizeof(float)),
+ [uvw_col_stride3] "r" (3 * in_col_stride * sizeof(float)),
+ [uvw_col_stride4] "r" (4 * in_col_stride * sizeof(float)),
+ [uvw_col_stride5] "r" (5 * in_col_stride * sizeof(float)),
+ [prftch] "i" (prefetch_depth * sizeof(float)),
+ [prftch_uvw_col_stride1] "r" ((prefetch_depth + 1 * in_col_stride) * sizeof(float)),
+ [prftch_uvw_col_stride2] "r" ((prefetch_depth + 2 * in_col_stride) * sizeof(float)),
+ [prftch_uvw_col_stride3] "r" ((prefetch_depth + 3 * in_col_stride) * sizeof(float)),
+ [prftch_uvw_col_stride4] "r" ((prefetch_depth + 4 * in_col_stride) * sizeof(float)),
+ [prftch_uvw_col_stride5] "r" ((prefetch_depth + 5 * in_col_stride) * sizeof(float))
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+ "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
+ "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ }
+ else if (channels_remaining >= 4)
+ {
+ int c4_rem = channels_remaining / 4;
+ channels_remaining %= 4;
+
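+ // General-stride path: like the block above, this processes four channels
+ // (one 128-bit q register) per iteration, but with separate input, output
+ // and weight column strides rather than the single combined "uvw" stride
+ // used above. Channels left over (< 4) fall through to the scalar loop below.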
+ asm volatile (
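+ // Map logical tile operands onto symbolic register names; several logical
+ // names deliberately share one physical register, which relies on their
+ // live ranges not overlapping within the kernel.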
+ "qW22 .req q0\n" "vW22 .req v0\n"
+ "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
+ "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
+ "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
+ "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
+ "qW21 .req q3\n" "vW21 .req v3\n"
+ "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
+ "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
+ "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
+ "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
+ "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
+ "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
+ "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
+ "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
+ "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
+ "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
+ "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
+ "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
+ "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
+ "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
+ "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
+ "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
+ "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
+ "qW33 .req q16\n" "vW33 .req v16\n"
+ "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
+ "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
+ "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
+ "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
+ "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
+ "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
+ "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
+ "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
+ "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
+ "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
+ "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
+ "qU23 .req q28\n" "qU52 .req q28\n"
+ "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
+
+ "uptr1 .req x0\n"
+ "uptr2 .req x1\n"
+ "uptr3 .req x2\n"
+ "uptr4 .req x3\n"
+ "uptr5 .req x4\n"
+
+ "vptr1 .req x5\n"
+ "vptr2 .req x6\n"
+ "vptr3 .req x7\n"
+
+ "wptr1 .req x8\n"
+ "wptr2 .req x9\n"
+
+ "u_col_stride2 .req x10\n"
+ "u_col_stride3 .req x11\n"
+ "u_col_stride4 .req x12\n"
+ "u_col_stride5 .req x13\n"
+
+ "v_col_stride2 .req x14\n"
+ "v_col_stride3 .req x15\n"
+
+ "w_col_stride2 .req x16\n"
+
+ // Prepare pointers and strides
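+ // (the strides are runtime values, so multiples beyond the first are
+ // built by repeated addition and used with register-offset addressing)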
+ "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+ "add uptr2, uptr1 , %x[u_row_stride]\n"
+ "add uptr3, uptr2 , %x[u_row_stride]\n"
+ "add uptr4, uptr3 , %x[u_row_stride]\n"
+ "add uptr5, uptr4 , %x[u_row_stride]\n"
+
+ "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+ "add vptr2, vptr1 , %x[v_row_stride]\n"
+ "add vptr3, vptr2 , %x[v_row_stride]\n"
+
+ "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+ "add wptr2, wptr1 , %x[w_row_stride]\n"
+
+ "add u_col_stride2, %x[u_col_stride1], %x[u_col_stride1]\n"
+ "add u_col_stride3, u_col_stride2 , %x[u_col_stride1]\n"
+ "add u_col_stride4, u_col_stride3 , %x[u_col_stride1]\n"
+ "add u_col_stride5, u_col_stride4 , %x[u_col_stride1]\n"
+
+ "add v_col_stride2, %x[v_col_stride1], %x[v_col_stride1]\n"
+ "add v_col_stride3, v_col_stride2 , %x[v_col_stride1]\n"
+
+ "add w_col_stride2, %x[w_col_stride1], %x[w_col_stride1]\n"
+
+ // Load initial operands
+ "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+ "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+ "subs %x[c4_rem], %x[c4_rem], #1\n"
+ "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+ "ldr qW23, [wptr1, w_col_stride2]\n"
+ "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+ "ldr qW33, [wptr2, w_col_stride2]\n"
+ "ldr qU26, [uptr1, u_col_stride5]\n"
+ "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
+ "ldr qU25, [uptr1, u_col_stride4]\n"
+ "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
+ "ldr qU36, [uptr2, u_col_stride5]\n"
+ "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
+ "ldr qW11, [%x[wptr0]], #0x10\n"
+ "fmul vV14.4s, vU16.4s, vW13.4s\n"
+ "ldr qU24, [uptr1, u_col_stride3]\n"
+ "fmul vV13.4s, vU15.4s, vW13.4s\n"
+ "ldr qW31, [wptr2], #0x10\n"
+ "fmla vV14.4s, vU15.4s, vW12.4s\n"
+ "ldr qW21, [wptr1], #0x10\n"
+ "fmul vV12.4s, vU14.4s, vW13.4s\n"
+ "ldr qU34, [uptr2, u_col_stride3]\n"
+ "fmla vV13.4s, vU14.4s, vW12.4s\n"
+ "ldr qU46, [uptr3, u_col_stride5]\n"
+ "fmla vV14.4s, vU14.4s, vW11.4s\n"
+ "ldr qU45, [uptr3, u_col_stride4]\n"
+ "fmla vV14.4s, vU26.4s, vW23.4s\n"
+ "ldr qU35, [uptr2, u_col_stride4]\n"
+ "fmul vV24.4s, vU26.4s, vW13.4s\n"
+ "ldr qU44, [uptr3, u_col_stride3]\n"
+ "fmla vV13.4s, vU25.4s, vW23.4s\n"
+ "beq 2f\n" // Single iteration only
+
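+ // Software-pipelined loop: the initial loads above prime the first
+ // iteration, and the body interleaves this iteration's arithmetic with
+ // loads and prefetches for the next one.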
+ "1:" // Loop body
+ "fmla vV14.4s, vU25.4s, vW22.4s\n"
+ "prfm pldl1keep, [%x[wptr0]]\n"
+ "fmul vV23.4s, vU25.4s, vW13.4s\n"
+ "prfm pldl1keep, [%x[wptr0], %x[w_col_stride1]]\n"
+ "fmla vV24.4s, vU25.4s, vW12.4s\n"
+ "ldr qU56, [uptr4, u_col_stride5]\n"
+ "fmla vV12.4s, vU24.4s, vW23.4s\n"
+ "prfm pldl1keep, [%x[wptr0], w_col_stride2 ]\n"
+ "fmla vV13.4s, vU24.4s, vW22.4s\n"
+ "prfm pldl1keep, [ wptr1 ]\n"
+ "fmla vV14.4s, vU24.4s, vW21.4s\n"
+ "prfm pldl1keep, [ wptr1 , %x[w_col_stride1]]\n"
+ "fmul vV22.4s, vU24.4s, vW13.4s\n"
+ "prfm pldl1keep, [ wptr1 , w_col_stride2 ]\n"
+ "fmla vV23.4s, vU24.4s, vW12.4s\n"
+ "prfm pldl1keep, [ wptr2 ]\n"
+ "fmla vV24.4s, vU24.4s, vW11.4s\n"
+ "ldr qU55, [uptr4, u_col_stride4]\n"
+ "fmla vV14.4s, vU36.4s, vW33.4s\n"
+ "prfm pldl1keep, [ wptr2 , %x[w_col_stride1]]\n"
+ "fmla vV24.4s, vU36.4s, vW23.4s\n"
+ "prfm pldl1keep, [ wptr2 , w_col_stride2 ]\n"
+ "fmul vV34.4s, vU36.4s, vW13.4s\n"
+ "ldr qU54, [uptr4, u_col_stride3]\n"
+ "fmla vV13.4s, vU35.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr2 , %x[u_col_stride1]]\n"
+ "fmla vV14.4s, vU35.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr2 , u_col_stride2 ]\n"
+ "fmla vV23.4s, vU35.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr2 , u_col_stride3 ]\n"
+ "fmla vV24.4s, vU35.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr2 , u_col_stride4 ]\n"
+ "fmul vV33.4s, vU35.4s, vW13.4s\n"
+ "prfm pldl1keep, [ uptr2 , u_col_stride5 ]\n"
+ "fmla vV34.4s, vU35.4s, vW12.4s\n"
+ "ldr qU66, [uptr5, u_col_stride5]\n"
+ "fmla vV12.4s, vU34.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr3 ]\n"
+ "fmla vV13.4s, vU34.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr3 , %x[u_col_stride1]]\n"
+ "fmla vV14.4s, vU34.4s, vW31.4s\n"
+ "str qV14, [%x[vptr0], v_col_stride3]\n"
+ "fmla vV22.4s, vU34.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr3 , u_col_stride2 ]\n"
+ "fmla vV23.4s, vU34.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr3 , u_col_stride3 ]\n"
+ "fmla vV24.4s, vU34.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr3 , u_col_stride4 ]\n"
+ "fmul vV32.4s, vU34.4s, vW13.4s\n"
+ "prfm pldl1keep, [ uptr3 , u_col_stride5 ]\n"
+ "fmla vV33.4s, vU34.4s, vW12.4s\n"
+ "prfm pldl1keep, [ uptr4 ]\n"
+ "fmla vV34.4s, vU34.4s, vW11.4s\n"
+ "ldr qU65, [uptr5, u_col_stride4]\n"
+ "fmla vV24.4s, vU46.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr4 , %x[u_col_stride1]]\n"
+ "fmla vV34.4s, vU46.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr4 , u_col_stride2 ]\n"
+ "fmul vV44.4s, vU46.4s, vW13.4s\n"
+ "ldr qU64, [uptr5, u_col_stride3]\n"
+ "fmla vV23.4s, vU45.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr4 , u_col_stride3 ]\n"
+ "fmla vV24.4s, vU45.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr4 , u_col_stride4 ]\n"
+ "fmla vV33.4s, vU45.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr4 , u_col_stride5 ]\n"
+ "fmla vV34.4s, vU45.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr5 ]\n"
+ "fmul vV43.4s, vU45.4s, vW13.4s\n"
+ "prfm pldl1keep, [ uptr5 , %x[u_col_stride1]]\n"
+ "fmla vV44.4s, vU45.4s, vW12.4s\n"
+ "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV22.4s, vU44.4s, vW33.4s\n"
+ "prfm pldl1keep, [ uptr5 , u_col_stride2 ]\n"
+ "fmla vV23.4s, vU44.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr5 , u_col_stride3 ]\n"
+ "fmla vV24.4s, vU44.4s, vW31.4s\n"
+ "str qV24, [vptr1, v_col_stride3]\n"
+ "fmla vV32.4s, vU44.4s, vW23.4s\n"
+ "prfm pldl1keep, [ uptr5 , u_col_stride4 ]\n"
+ "fmla vV33.4s, vU44.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr5 , u_col_stride5 ]\n"
+ "fmla vV34.4s, vU44.4s, vW21.4s\n"
+ "prfm pstl1keep, [%x[vptr0]]\n"
+ "fmul vV42.4s, vU44.4s, vW13.4s\n"
+ "prfm pstl1keep, [%x[vptr0], %x[v_col_stride1]]\n"
+ "fmla vV43.4s, vU44.4s, vW12.4s\n"
+ "prfm pstl1keep, [%x[vptr0], v_col_stride2 ]\n"
+ "fmla vV44.4s, vU44.4s, vW11.4s\n"
+ "ldr qU23, [uptr1, u_col_stride2]\n"
+ "fmla vV34.4s, vU56.4s, vW33.4s\n"
+ "prfm pstl1keep, [%x[vptr0], v_col_stride3 ]\n"
+ "fmla vV44.4s, vU56.4s, vW23.4s\n"
+ "ldr qU33, [uptr2, u_col_stride2]\n"
+ "fmla vV33.4s, vU55.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr1 ]\n"
+ "fmla vV34.4s, vU55.4s, vW32.4s\n"
+ "prfm pstl1keep, [ vptr1 , %x[v_col_stride1]]\n"
+ "fmla vV43.4s, vU55.4s, vW23.4s\n"
+ "prfm pstl1keep, [ vptr1 , v_col_stride2 ]\n"
+ "fmla vV44.4s, vU55.4s, vW22.4s\n"
+ "ldr qU43, [uptr3, u_col_stride2]\n"
+ "fmla vV32.4s, vU54.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr1 , v_col_stride3 ]\n"
+ "fmla vV33.4s, vU54.4s, vW32.4s\n"
+ "prfm pstl1keep, [ vptr2 ]\n"
+ "fmla vV34.4s, vU54.4s, vW31.4s\n"
+ "str qV34, [vptr2, v_col_stride3]\n"
+ "fmla vV42.4s, vU54.4s, vW23.4s\n"
+ "prfm pstl1keep, [ vptr2 , %x[v_col_stride1]]\n"
+ "fmla vV43.4s, vU54.4s, vW22.4s\n"
+ "prfm pstl1keep, [ vptr2 , v_col_stride2 ]\n"
+ "fmla vV44.4s, vU54.4s, vW21.4s\n"
+ "ldr qU53, [uptr4, u_col_stride2]\n"
+ "fmla vV44.4s, vU66.4s, vW33.4s\n"
+ "ldr qU63, [uptr5, u_col_stride2]\n"
+ "fmla vV43.4s, vU65.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr2 , v_col_stride3 ]\n"
+ "fmla vV44.4s, vU65.4s, vW32.4s\n"
+ "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
+ "fmla vV42.4s, vU64.4s, vW33.4s\n"
+ "prfm pstl1keep, [ vptr3 ]\n"
+ "fmla vV43.4s, vU64.4s, vW32.4s\n"
+ "prfm pstl1keep, [ vptr3 , %x[v_col_stride1]]\n"
+ "fmla vV44.4s, vU64.4s, vW31.4s\n"
+ "str qV44, [vptr3, v_col_stride3]\n"
+ "fmul vV11.4s, vU13.4s, vW13.4s\n"
+ "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
+ "fmla vV12.4s, vU13.4s, vW12.4s\n"
+ "prfm pstl1keep, [ vptr3 , v_col_stride2 ]\n"
+ "fmla vV13.4s, vU13.4s, vW11.4s\n"
+ "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
+ "fmla vV11.4s, vU23.4s, vW23.4s\n"
+ "prfm pstl1keep, [ vptr3 , v_col_stride3 ]\n"
+ "fmla vV12.4s, vU23.4s, vW22.4s\n"
+ "fmla vV13.4s, vU23.4s, vW21.4s\n"
+ "fmul vV21.4s, vU23.4s, vW13.4s\n"
+ "fmla vV22.4s, vU23.4s, vW12.4s\n"
+ "fmla vV23.4s, vU23.4s, vW11.4s\n"
+ "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
+ "fmla vV11.4s, vU33.4s, vW33.4s\n"
+ "fmla vV12.4s, vU33.4s, vW32.4s\n"
+ "fmla vV13.4s, vU33.4s, vW31.4s\n"
+ "str qV13, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21.4s, vU33.4s, vW23.4s\n"
+ "fmla vV22.4s, vU33.4s, vW22.4s\n"
+ "fmla vV23.4s, vU33.4s, vW21.4s\n"
+ "fmul vV31.4s, vU33.4s, vW13.4s\n"
+ "fmla vV32.4s, vU33.4s, vW12.4s\n"
+ "fmla vV33.4s, vU33.4s, vW11.4s\n"
+ "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
+ "fmla vV21.4s, vU43.4s, vW33.4s\n"
+ "fmla vV22.4s, vU43.4s, vW32.4s\n"
+ "fmla vV23.4s, vU43.4s, vW31.4s\n"
+ "str qV23, [vptr1, v_col_stride2]\n"
+ "fmla vV31.4s, vU43.4s, vW23.4s\n"
+ "fmla vV32.4s, vU43.4s, vW22.4s\n"
+ "fmla vV33.4s, vU43.4s, vW21.4s\n"
+ "fmul vV41.4s, vU43.4s, vW13.4s\n"
+ "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+ "fmla vV42.4s, vU43.4s, vW12.4s\n"
+ "fmla vV43.4s, vU43.4s, vW11.4s\n"
+ "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
+ "fmla vV31.4s, vU53.4s, vW33.4s\n"
+ "fmla vV32.4s, vU53.4s, vW32.4s\n"
+ "fmla vV33.4s, vU53.4s, vW31.4s\n"
+ "str qV33, [vptr2, v_col_stride2]\n"
+ "fmla vV41.4s, vU53.4s, vW23.4s\n"
+ "ldr qW23, [wptr1, w_col_stride2]\n"
+ "fmla vV42.4s, vU53.4s, vW22.4s\n"
+ "fmla vV43.4s, vU53.4s, vW21.4s\n"
+ "ldr qU11, [%x[uptr0]], #0x10\n"
+ "fmla vV41.4s, vU63.4s, vW33.4s\n"
+ "ldr qW33, [wptr2, w_col_stride2]\n"
+ "fmla vV42.4s, vU63.4s, vW32.4s\n"
+ "prfm pldl1keep, [%x[uptr0]]\n"
+ "fmla vV43.4s, vU63.4s, vW31.4s\n"
+ "str qV43, [vptr3, v_col_stride2]\n"
+ "fmla vV11.4s, vU12.4s, vW12.4s\n"
+ "ldr qU21, [uptr1], #0x10\n"
+ "fmla vV12.4s, vU12.4s, vW11.4s\n"
+ "ldr qU31, [uptr2], #0x10\n"
+ "fmla vV11.4s, vU22.4s, vW22.4s\n"
+ "prfm pldl1keep, [%x[uptr0], %x[u_col_stride1]]\n"
+ "fmla vV12.4s, vU22.4s, vW21.4s\n"
+ "prfm pldl1keep, [%x[uptr0], u_col_stride2 ]\n"
+ "fmla vV21.4s, vU22.4s, vW12.4s\n"
+ "prfm pldl1keep, [%x[uptr0], u_col_stride3 ]\n"
+ "fmla vV22.4s, vU22.4s, vW11.4s\n"
+ "ldr qU41, [uptr3], #0x10\n"
+ "fmla vV11.4s, vU32.4s, vW32.4s\n"
+ "prfm pldl1keep, [%x[uptr0], u_col_stride4 ]\n"
+ "fmla vV12.4s, vU32.4s, vW31.4s\n"
+ "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
+ "fmla vV21.4s, vU32.4s, vW22.4s\n"
+ "prfm pldl1keep, [%x[uptr0], u_col_stride5 ]\n"
+ "fmla vV22.4s, vU32.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr1 ]\n"
+ "fmla vV31.4s, vU32.4s, vW12.4s\n"
+ "prfm pldl1keep, [ uptr1 , %x[u_col_stride1]]\n"
+ "fmla vV32.4s, vU32.4s, vW11.4s\n"
+ "ldr qU51, [uptr4], #0x10\n"
+ "fmla vV21.4s, vU42.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr1 , u_col_stride2 ]\n"
+ "fmla vV22.4s, vU42.4s, vW31.4s\n"
+ "str qV22, [vptr1, %x[v_col_stride1]]\n"
+ "fmla vV31.4s, vU42.4s, vW22.4s\n"
+ "prfm pldl1keep, [ uptr1 , u_col_stride3 ]\n"
+ "fmla vV32.4s, vU42.4s, vW21.4s\n"
+ "subs %x[c4_rem], %x[c4_rem], #1\n"
+ "fmla vV41.4s, vU42.4s, vW12.4s\n"
+ "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
+ "fmla vV42.4s, vU42.4s, vW11.4s\n"
+ "ldr qU61, [uptr5], #0x10\n"
+ "fmla vV31.4s, vU52.4s, vW32.4s\n"
+ "prfm pldl1keep, [ uptr1 , u_col_stride4 ]\n"
+ "fmla vV32.4s, vU52.4s, vW31.4s\n"
+ "str qV32, [vptr2, %x[v_col_stride1]]\n"
+ "fmla vV41.4s, vU52.4s, vW22.4s\n"
+ "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
+ "fmla vV42.4s, vU52.4s, vW21.4s\n"
+ "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+ "fmla vV41.4s, vU62.4s, vW32.4s\n"
+ "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
+ "fmla vV42.4s, vU62.4s, vW31.4s\n"
+ "str qV42, [vptr3, %x[v_col_stride1]]\n"
+ "fmla vV11.4s, vU11.4s, vW11.4s\n"
+ "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+ "fmla vV11.4s, vU21.4s, vW21.4s\n"
+ "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+ "fmla vV21.4s, vU21.4s, vW11.4s\n"
+ "ldr qU26, [uptr1, u_col_stride5]\n"
+ "fmla vV11.4s, vU31.4s, vW31.4s\n"
+ "str qV11, [%x[vptr0]], #0x10\n"
+ "fmla vV21.4s, vU31.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr1 , u_col_stride5 ]\n"
+ "fmla vV31.4s, vU31.4s, vW11.4s\n"
+ "ldr qU25, [uptr1, u_col_stride4]\n"
+ "fmla vV21.4s, vU41.4s, vW31.4s\n"
+ "str qV21, [vptr1], #0x10\n"
+ "fmla vV31.4s, vU41.4s, vW21.4s\n"
+ "prfm pldl1keep, [ uptr2 ]\n"
+ "fmla vV41.4s, vU41.4s, vW11.4s\n"
+ "ldr qW11, [%x[wptr0]], #0x10\n"
+ "fmla vV31.4s, vU51.4s, vW31.4s\n"
+ "str qV31, [vptr2], #0x10\n"
+ "fmla vV41.4s, vU51.4s, vW21.4s\n"
+ "ldr qU36, [uptr2, u_col_stride5]\n"
+ "fmla vV41.4s, vU61.4s, vW31.4s\n"
+ "str qV41, [vptr3], #0x10\n"
+ "fmul vV14.4s, vU16.4s, vW13.4s\n"
+ "ldr qU24, [uptr1, u_col_stride3]\n"
+ "fmul vV13.4s, vU15.4s, vW13.4s\n"
+ "ldr qW31, [wptr2], #0x10\n"
+ "fmla vV14.4s, vU15.4s, vW12.4s\n"
+ "ldr qW21, [wptr1], #0x10\n"
+ "fmul vV12.4s, vU14.4s, vW13.4s\n"
+ "ldr qU34, [uptr2, u_col_stride3]\n"
+ "fmla vV13.4s, vU14.4s, vW12.4s\n"
+ "ldr qU46, [uptr3, u_col_stride5]\n"
+ "fmla vV14.4s, vU14.4s, vW11.4s\n"
+ "ldr qU45, [uptr3, u_col_stride4]\n"
+ "fmla vV14.4s, vU26.4s, vW23.4s\n"
+ "ldr qU35, [uptr2, u_col_stride4]\n"
+ "fmul vV24.4s, vU26.4s, vW13.4s\n"
+ "ldr qU44, [uptr3, u_col_stride3]\n"
+ "fmla vV13.4s, vU25.4s, vW23.4s\n"
+ "bne 1b\n"
+
+ "2:" // Final iteration
+ "fmla vV14.4s, vU25.4s, vW22.4s\n"
+ "fmul vV23.4s, vU25.4s, vW13.4s\n"
+ "fmla vV24.4s, vU25.4s, vW12.4s\n"
+ "ldr qU56, [uptr4, u_col_stride5]\n"
+ "fmla vV12.4s, vU24.4s, vW23.4s\n"
+ "fmla vV13.4s, vU24.4s, vW22.4s\n"
+ "fmla vV14.4s, vU24.4s, vW21.4s\n"
+ "fmul vV22.4s, vU24.4s, vW13.4s\n"
+ "fmla vV23.4s, vU24.4s, vW12.4s\n"
+ "fmla vV24.4s, vU24.4s, vW11.4s\n"
+ "ldr qU55, [uptr4, u_col_stride4]\n"
+ "fmla vV14.4s, vU36.4s, vW33.4s\n"
+ "fmla vV24.4s, vU36.4s, vW23.4s\n"
+ "fmul vV34.4s, vU36.4s, vW13.4s\n"
+ "ldr qU54, [uptr4, u_col_stride3]\n"
+ "fmla vV13.4s, vU35.4s, vW33.4s\n"
+ "fmla vV14.4s, vU35.4s, vW32.4s\n"
+ "fmla vV23.4s, vU35.4s, vW23.4s\n"
+ "fmla vV24.4s, vU35.4s, vW22.4s\n"
+ "fmul vV33.4s, vU35.4s, vW13.4s\n"
+ "fmla vV34.4s, vU35.4s, vW12.4s\n"
+ "ldr qU66, [uptr5, u_col_stride5]\n"
+ "fmla vV12.4s, vU34.4s, vW33.4s\n"
+ "fmla vV13.4s, vU34.4s, vW32.4s\n"
+ "fmla vV14.4s, vU34.4s, vW31.4s\n"
+ "str qV14, [%x[vptr0], v_col_stride3]\n"
+ "fmla vV22.4s, vU34.4s, vW23.4s\n"
+ "fmla vV23.4s, vU34.4s, vW22.4s\n"
+ "fmla vV24.4s, vU34.4s, vW21.4s\n"
+ "fmul vV32.4s, vU34.4s, vW13.4s\n"
+ "fmla vV33.4s, vU34.4s, vW12.4s\n"
+ "fmla vV34.4s, vU34.4s, vW11.4s\n"
+ "ldr qU65, [uptr5, u_col_stride4]\n"
+ "fmla vV24.4s, vU46.4s, vW33.4s\n"
+ "fmla vV34.4s, vU46.4s, vW23.4s\n"
+ "fmul vV44.4s, vU46.4s, vW13.4s\n"
+ "ldr qU64, [uptr5, u_col_stride3]\n"
+ "fmla vV23.4s, vU45.4s, vW33.4s\n"
+ "fmla vV24.4s, vU45.4s, vW32.4s\n"
+ "fmla vV33.4s, vU45.4s, vW23.4s\n"
+ "fmla vV34.4s, vU45.4s, vW22.4s\n"
+ "fmul vV43.4s, vU45.4s, vW13.4s\n"
+ "fmla vV44.4s, vU45.4s, vW12.4s\n"
+ "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+ "fmla vV22.4s, vU44.4s, vW33.4s\n"
+ "fmla vV23.4s, vU44.4s, vW32.4s\n"
+ "fmla vV24.4s, vU44.4s, vW31.4s\n"
+ "str qV24, [vptr1, v_col_stride3]\n"
+ "fmla vV32.4s, vU44.4s, vW23.4s\n"
+ "fmla vV33.4s, vU44.4s, vW22.4s\n"
+ "fmla vV34.4s, vU44.4s, vW21.4s\n"
+ "fmul vV42.4s, vU44.4s, vW13.4s\n"
+ "fmla vV43.4s, vU44.4s, vW12.4s\n"
+ "fmla vV44.4s, vU44.4s, vW11.4s\n"
+ "ldr qU23, [uptr1, u_col_stride2]\n"
+ "fmla vV34.4s, vU56.4s, vW33.4s\n"
+ "fmla vV44.4s, vU56.4s, vW23.4s\n"
+ "ldr qU33, [uptr2, u_col_stride2]\n"
+ "fmla vV33.4s, vU55.4s, vW33.4s\n"
+ "fmla vV34.4s, vU55.4s, vW32.4s\n"
+ "fmla vV43.4s, vU55.4s, vW23.4s\n"
+ "fmla vV44.4s, vU55.4s, vW22.4s\n"
+ "ldr qU43, [uptr3, u_col_stride2]\n"
+ "fmla vV32.4s, vU54.4s, vW33.4s\n"
+ "fmla vV33.4s, vU54.4s, vW32.4s\n"
+ "fmla vV34.4s, vU54.4s, vW31.4s\n"
+ "str qV34, [vptr2, v_col_stride3]\n"
+ "fmla vV42.4s, vU54.4s, vW23.4s\n"
+ "fmla vV43.4s, vU54.4s, vW22.4s\n"
+ "fmla vV44.4s, vU54.4s, vW21.4s\n"
+ "ldr qU53, [uptr4, u_col_stride2]\n"
+ "fmla vV44.4s, vU66.4s, vW33.4s\n"
+ "ldr qU63, [uptr5, u_col_stride2]\n"
+ "fmla vV43.4s, vU65.4s, vW33.4s\n"
+ "fmla vV44.4s, vU65.4s, vW32.4s\n"
+ "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
+ "fmla vV42.4s, vU64.4s, vW33.4s\n"
+ "fmla vV43.4s, vU64.4s, vW32.4s\n"
+ "fmla vV44.4s, vU64.4s, vW31.4s\n"
+ "str qV44, [vptr3, v_col_stride3]\n"
+ "fmul vV11.4s, vU13.4s, vW13.4s\n"
+ "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
+ "fmla vV12.4s, vU13.4s, vW12.4s\n"
+ "fmla vV13.4s, vU13.4s, vW11.4s\n"
+ "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
+ "fmla vV11.4s, vU23.4s, vW23.4s\n"
+ "fmla vV12.4s, vU23.4s, vW22.4s\n"
+ "fmla vV13.4s, vU23.4s, vW21.4s\n"
+ "fmul vV21.4s, vU23.4s, vW13.4s\n"
+ "fmla vV22.4s, vU23.4s, vW12.4s\n"
+ "fmla vV23.4s, vU23.4s, vW11.4s\n"
+ "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
+ "fmla vV11.4s, vU33.4s, vW33.4s\n"
+ "fmla vV12.4s, vU33.4s, vW32.4s\n"
+ "fmla vV13.4s, vU33.4s, vW31.4s\n"
+ "str qV13, [%x[vptr0], v_col_stride2]\n"
+ "fmla vV21.4s, vU33.4s, vW23.4s\n"
+ "fmla vV22.4s, vU33.4s, vW22.4s\n"
+ "fmla vV23.4s, vU33.4s, vW21.4s\n"
+ "fmul vV31.4s, vU33.4s, vW13.4s\n"
+ "fmla vV32.4s, vU33.4s, vW12.4s\n"
+ "fmla vV33.4s, vU33.4s, vW11.4s\n"
+ "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
+ "fmla vV21.4s, vU43.4s, vW33.4s\n"
+ "fmla vV22.4s, vU43.4s, vW32.4s\n"
+ "fmla vV23.4s, vU43.4s, vW31.4s\n"
+ "str qV23, [vptr1, v_col_stride2]\n"
+ "fmla vV31.4s, vU43.4s, vW23.4s\n"
+ "fmla vV32.4s, vU43.4s, vW22.4s\n"
+ "fmla vV33.4s, vU43.4s, vW21.4s\n"
+ "fmul vV41.4s, vU43.4s, vW13.4s\n"
+ "fmla vV42.4s, vU43.4s, vW12.4s\n"
+ "fmla vV43.4s, vU43.4s, vW11.4s\n"
+ "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
+ "fmla vV31.4s, vU53.4s, vW33.4s\n"
+ "fmla vV32.4s, vU53.4s, vW32.4s\n"
+ "fmla vV33.4s, vU53.4s, vW31.4s\n"
+ "str qV33, [vptr2, v_col_stride2]\n"
+ "fmla vV41.4s, vU53.4s, vW23.4s\n"
+ "fmla vV42.4s, vU53.4s, vW22.4s\n"
+ "fmla vV43.4s, vU53.4s, vW21.4s\n"
+ "ldr qU11, [%x[uptr0]], #0x10\n"
+ "fmla vV41.4s, vU63.4s, vW33.4s\n"
+ "fmla vV42.4s, vU63.4s, vW32.4s\n"
+ "fmla vV43.4s, vU63.4s, vW31.4s\n"
+ "str qV43, [vptr3, v_col_stride2]\n"
+ "fmla vV11.4s, vU12.4s, vW12.4s\n"
+ "ldr qU21, [uptr1], #0x10\n"
+ "fmla vV12.4s, vU12.4s, vW11.4s\n"
+ "ldr qU31, [uptr2], #0x10\n"
+ "fmla vV11.4s, vU22.4s, vW22.4s\n"
+ "fmla vV12.4s, vU22.4s, vW21.4s\n"
+ "fmla vV21.4s, vU22.4s, vW12.4s\n"
+ "fmla vV22.4s, vU22.4s, vW11.4s\n"
+ "ldr qU41, [uptr3], #0x10\n"
+ "fmla vV11.4s, vU32.4s, vW32.4s\n"
+ "fmla vV12.4s, vU32.4s, vW31.4s\n"
+ "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
+ "fmla vV21.4s, vU32.4s, vW22.4s\n"
+ "fmla vV22.4s, vU32.4s, vW21.4s\n"
+ "fmla vV31.4s, vU32.4s, vW12.4s\n"
+ "fmla vV32.4s, vU32.4s, vW11.4s\n"
+ "ldr qU51, [uptr4], #0x10\n"
+ "fmla vV21.4s, vU42.4s, vW32.4s\n"
+ "fmla vV22.4s, vU42.4s, vW31.4s\n"
+ "str qV22, [vptr1, %x[v_col_stride1]]\n"
+ "fmla vV31.4s, vU42.4s, vW22.4s\n"
+ "fmla vV32.4s, vU42.4s, vW21.4s\n"
+ "subs %x[c4_rem], %x[c4_rem], #1\n"
+ "fmla vV41.4s, vU42.4s, vW12.4s\n"
+ "fmla vV42.4s, vU42.4s, vW11.4s\n"
+ "ldr qU61, [uptr5], #0x10\n"
+ "fmla vV31.4s, vU52.4s, vW32.4s\n"
+ "fmla vV32.4s, vU52.4s, vW31.4s\n"
+ "str qV32, [vptr2, %x[v_col_stride1]]\n"
+ "fmla vV41.4s, vU52.4s, vW22.4s\n"
+ "fmla vV42.4s, vU52.4s, vW21.4s\n"
+ "fmla vV41.4s, vU62.4s, vW32.4s\n"
+ "fmla vV42.4s, vU62.4s, vW31.4s\n"
+ "str qV42, [vptr3, %x[v_col_stride1]]\n"
+ "fmla vV11.4s, vU11.4s, vW11.4s\n"
+ "fmla vV11.4s, vU21.4s, vW21.4s\n"
+ "fmla vV21.4s, vU21.4s, vW11.4s\n"
+ "fmla vV11.4s, vU31.4s, vW31.4s\n"
+ "str qV11, [%x[vptr0]], #0x10\n"
+ "fmla vV21.4s, vU31.4s, vW21.4s\n"
+ "fmla vV31.4s, vU31.4s, vW11.4s\n"
+ "fmla vV21.4s, vU41.4s, vW31.4s\n"
+ "str qV21, [vptr1], #0x10\n"
+ "fmla vV31.4s, vU41.4s, vW21.4s\n"
+ "fmla vV41.4s, vU41.4s, vW11.4s\n"
+ "fmla vV31.4s, vU51.4s, vW31.4s\n"
+ "str qV31, [vptr2], #0x10\n"
+ "fmla vV41.4s, vU51.4s, vW21.4s\n"
+ "fmla vV41.4s, vU61.4s, vW31.4s\n"
+ "str qV41, [vptr3], #0x10\n"
+
+ ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
+ ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
+ ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
+ ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
+ ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
+ ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
+ ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
+ ".unreq qV22\n" ".unreq qU14\n"
+ ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
+ ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
+ ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
+ ".unreq qW33\n"
+ ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
+ ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
+ ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
+ ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
+ ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
+ ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
+ ".unreq qU53\n" ".unreq vW22\n"
+ ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
+ ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
+ ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
+ ".unreq vV12\n" ".unreq vU61\n"
+ ".unreq vU26\n" ".unreq vV32\n"
+ ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
+ ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
+ ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
+ ".unreq vV22\n" ".unreq vU14\n"
+ ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
+ ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
+ ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
+ ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
+ ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
+ ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
+ ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
+ ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
+ ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
+ : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
+ [c4_rem] "+r" (c4_rem)
+ : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+ [u_col_stride1] "r" (in_col_stride * sizeof(float)),
+ [v_row_stride] "r" (out_row_stride * sizeof(float)),
+ [v_col_stride1] "r" (out_col_stride * sizeof(float)),
+ [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+ [w_col_stride1] "r" (weight_col_stride * sizeof(float))
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+ "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
+ "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+ "x12", "x13", "x14", "x15", "x16", "cc", "memory"
+ );
+ }
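+ // Scalar tail: handle any channels remaining after the vectorised blocks,
+ // one channel at a time, inserting explicit zeros for padded border
+ // elements of the input tile.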
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ float u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float* const inptr_row = uptr0 + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<float>(0);
+ }
+ else
+ {
+ u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ uptr0++;
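+ // (a single-float step advances to the next channel, which assumes a
+ // channel-innermost data layout)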
+
+ // Load weights tile
+ float w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float* const wptr_row = wptr0 + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = *(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr0++;
+
+ // Perform the convolution
+ float v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<float>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float* const outptr_row = vptr0 + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ vptr0++;
+ }
+}
+
+#endif // __aarch64__
+
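+// The old six-dimensional tile_fns lookup table is replaced by small
+// specialised tables: an unpadded fast path, one-dimensional tables for
+// top/left input padding, two-dimensional tables combining bottom/right
+// input padding with output padding, and a fully generic fallback for any
+// remaining combination.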
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
};
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 2104c0b..8eb53a6 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -29,5179 +29,138 @@
using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float, float>;
template <>
-const Conv::TileFn Conv::tile_fns
- [max_in_pad_top]
- [max_in_pad_left]
- [max_in_pad_bottom]
- [max_in_pad_right]
- [max_out_pad_bottom]
- [max_out_pad_right] = {
- { // Input pad top = 0
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 0, 0, 0>,
- Conv::template process_tile<0, 0, 0, 0, 0, 1>,
- Conv::template process_tile<0, 0, 0, 0, 0, 2>,
- Conv::template process_tile<0, 0, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 0, 1, 0>,
- Conv::template process_tile<0, 0, 0, 0, 1, 1>,
- Conv::template process_tile<0, 0, 0, 0, 1, 2>,
- Conv::template process_tile<0, 0, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 0, 2, 0>,
- Conv::template process_tile<0, 0, 0, 0, 2, 1>,
- Conv::template process_tile<0, 0, 0, 0, 2, 2>,
- Conv::template process_tile<0, 0, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 0, 3, 0>,
- Conv::template process_tile<0, 0, 0, 0, 3, 1>,
- Conv::template process_tile<0, 0, 0, 0, 3, 2>,
- Conv::template process_tile<0, 0, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 1, 0, 0>,
- Conv::template process_tile<0, 0, 0, 1, 0, 1>,
- Conv::template process_tile<0, 0, 0, 1, 0, 2>,
- Conv::template process_tile<0, 0, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 1, 1, 0>,
- Conv::template process_tile<0, 0, 0, 1, 1, 1>,
- Conv::template process_tile<0, 0, 0, 1, 1, 2>,
- Conv::template process_tile<0, 0, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 1, 2, 0>,
- Conv::template process_tile<0, 0, 0, 1, 2, 1>,
- Conv::template process_tile<0, 0, 0, 1, 2, 2>,
- Conv::template process_tile<0, 0, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 1, 3, 0>,
- Conv::template process_tile<0, 0, 0, 1, 3, 1>,
- Conv::template process_tile<0, 0, 0, 1, 3, 2>,
- Conv::template process_tile<0, 0, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 2, 0, 0>,
- Conv::template process_tile<0, 0, 0, 2, 0, 1>,
- Conv::template process_tile<0, 0, 0, 2, 0, 2>,
- Conv::template process_tile<0, 0, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 2, 1, 0>,
- Conv::template process_tile<0, 0, 0, 2, 1, 1>,
- Conv::template process_tile<0, 0, 0, 2, 1, 2>,
- Conv::template process_tile<0, 0, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 2, 2, 0>,
- Conv::template process_tile<0, 0, 0, 2, 2, 1>,
- Conv::template process_tile<0, 0, 0, 2, 2, 2>,
- Conv::template process_tile<0, 0, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 2, 3, 0>,
- Conv::template process_tile<0, 0, 0, 2, 3, 1>,
- Conv::template process_tile<0, 0, 0, 2, 3, 2>,
- Conv::template process_tile<0, 0, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 3, 0, 0>,
- Conv::template process_tile<0, 0, 0, 3, 0, 1>,
- Conv::template process_tile<0, 0, 0, 3, 0, 2>,
- Conv::template process_tile<0, 0, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 3, 1, 0>,
- Conv::template process_tile<0, 0, 0, 3, 1, 1>,
- Conv::template process_tile<0, 0, 0, 3, 1, 2>,
- Conv::template process_tile<0, 0, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 3, 2, 0>,
- Conv::template process_tile<0, 0, 0, 3, 2, 1>,
- Conv::template process_tile<0, 0, 0, 3, 2, 2>,
- Conv::template process_tile<0, 0, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 3, 3, 0>,
- Conv::template process_tile<0, 0, 0, 3, 3, 1>,
- Conv::template process_tile<0, 0, 0, 3, 3, 2>,
- Conv::template process_tile<0, 0, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 4, 0, 0>,
- Conv::template process_tile<0, 0, 0, 4, 0, 1>,
- Conv::template process_tile<0, 0, 0, 4, 0, 2>,
- Conv::template process_tile<0, 0, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 4, 1, 0>,
- Conv::template process_tile<0, 0, 0, 4, 1, 1>,
- Conv::template process_tile<0, 0, 0, 4, 1, 2>,
- Conv::template process_tile<0, 0, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 4, 2, 0>,
- Conv::template process_tile<0, 0, 0, 4, 2, 1>,
- Conv::template process_tile<0, 0, 0, 4, 2, 2>,
- Conv::template process_tile<0, 0, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 4, 3, 0>,
- Conv::template process_tile<0, 0, 0, 4, 3, 1>,
- Conv::template process_tile<0, 0, 0, 4, 3, 2>,
- Conv::template process_tile<0, 0, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 5, 0, 0>,
- Conv::template process_tile<0, 0, 0, 5, 0, 1>,
- Conv::template process_tile<0, 0, 0, 5, 0, 2>,
- Conv::template process_tile<0, 0, 0, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 5, 1, 0>,
- Conv::template process_tile<0, 0, 0, 5, 1, 1>,
- Conv::template process_tile<0, 0, 0, 5, 1, 2>,
- Conv::template process_tile<0, 0, 0, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 5, 2, 0>,
- Conv::template process_tile<0, 0, 0, 5, 2, 1>,
- Conv::template process_tile<0, 0, 0, 5, 2, 2>,
- Conv::template process_tile<0, 0, 0, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 5, 3, 0>,
- Conv::template process_tile<0, 0, 0, 5, 3, 1>,
- Conv::template process_tile<0, 0, 0, 5, 3, 2>,
- Conv::template process_tile<0, 0, 0, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 0, 6, 0, 0>,
- Conv::template process_tile<0, 0, 0, 6, 0, 1>,
- Conv::template process_tile<0, 0, 0, 6, 0, 2>,
- Conv::template process_tile<0, 0, 0, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 0, 6, 1, 0>,
- Conv::template process_tile<0, 0, 0, 6, 1, 1>,
- Conv::template process_tile<0, 0, 0, 6, 1, 2>,
- Conv::template process_tile<0, 0, 0, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 0, 6, 2, 0>,
- Conv::template process_tile<0, 0, 0, 6, 2, 1>,
- Conv::template process_tile<0, 0, 0, 6, 2, 2>,
- Conv::template process_tile<0, 0, 0, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 0, 6, 3, 0>,
- Conv::template process_tile<0, 0, 0, 6, 3, 1>,
- Conv::template process_tile<0, 0, 0, 6, 3, 2>,
- Conv::template process_tile<0, 0, 0, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 0, 0, 0>,
- Conv::template process_tile<0, 0, 1, 0, 0, 1>,
- Conv::template process_tile<0, 0, 1, 0, 0, 2>,
- Conv::template process_tile<0, 0, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 0, 1, 0>,
- Conv::template process_tile<0, 0, 1, 0, 1, 1>,
- Conv::template process_tile<0, 0, 1, 0, 1, 2>,
- Conv::template process_tile<0, 0, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 0, 2, 0>,
- Conv::template process_tile<0, 0, 1, 0, 2, 1>,
- Conv::template process_tile<0, 0, 1, 0, 2, 2>,
- Conv::template process_tile<0, 0, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 0, 3, 0>,
- Conv::template process_tile<0, 0, 1, 0, 3, 1>,
- Conv::template process_tile<0, 0, 1, 0, 3, 2>,
- Conv::template process_tile<0, 0, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 1, 0, 0>,
- Conv::template process_tile<0, 0, 1, 1, 0, 1>,
- Conv::template process_tile<0, 0, 1, 1, 0, 2>,
- Conv::template process_tile<0, 0, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 1, 1, 0>,
- Conv::template process_tile<0, 0, 1, 1, 1, 1>,
- Conv::template process_tile<0, 0, 1, 1, 1, 2>,
- Conv::template process_tile<0, 0, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 1, 2, 0>,
- Conv::template process_tile<0, 0, 1, 1, 2, 1>,
- Conv::template process_tile<0, 0, 1, 1, 2, 2>,
- Conv::template process_tile<0, 0, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 1, 3, 0>,
- Conv::template process_tile<0, 0, 1, 1, 3, 1>,
- Conv::template process_tile<0, 0, 1, 1, 3, 2>,
- Conv::template process_tile<0, 0, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 2, 0, 0>,
- Conv::template process_tile<0, 0, 1, 2, 0, 1>,
- Conv::template process_tile<0, 0, 1, 2, 0, 2>,
- Conv::template process_tile<0, 0, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 2, 1, 0>,
- Conv::template process_tile<0, 0, 1, 2, 1, 1>,
- Conv::template process_tile<0, 0, 1, 2, 1, 2>,
- Conv::template process_tile<0, 0, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 2, 2, 0>,
- Conv::template process_tile<0, 0, 1, 2, 2, 1>,
- Conv::template process_tile<0, 0, 1, 2, 2, 2>,
- Conv::template process_tile<0, 0, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 2, 3, 0>,
- Conv::template process_tile<0, 0, 1, 2, 3, 1>,
- Conv::template process_tile<0, 0, 1, 2, 3, 2>,
- Conv::template process_tile<0, 0, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 3, 0, 0>,
- Conv::template process_tile<0, 0, 1, 3, 0, 1>,
- Conv::template process_tile<0, 0, 1, 3, 0, 2>,
- Conv::template process_tile<0, 0, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 3, 1, 0>,
- Conv::template process_tile<0, 0, 1, 3, 1, 1>,
- Conv::template process_tile<0, 0, 1, 3, 1, 2>,
- Conv::template process_tile<0, 0, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 3, 2, 0>,
- Conv::template process_tile<0, 0, 1, 3, 2, 1>,
- Conv::template process_tile<0, 0, 1, 3, 2, 2>,
- Conv::template process_tile<0, 0, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 3, 3, 0>,
- Conv::template process_tile<0, 0, 1, 3, 3, 1>,
- Conv::template process_tile<0, 0, 1, 3, 3, 2>,
- Conv::template process_tile<0, 0, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 4, 0, 0>,
- Conv::template process_tile<0, 0, 1, 4, 0, 1>,
- Conv::template process_tile<0, 0, 1, 4, 0, 2>,
- Conv::template process_tile<0, 0, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 4, 1, 0>,
- Conv::template process_tile<0, 0, 1, 4, 1, 1>,
- Conv::template process_tile<0, 0, 1, 4, 1, 2>,
- Conv::template process_tile<0, 0, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 4, 2, 0>,
- Conv::template process_tile<0, 0, 1, 4, 2, 1>,
- Conv::template process_tile<0, 0, 1, 4, 2, 2>,
- Conv::template process_tile<0, 0, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 4, 3, 0>,
- Conv::template process_tile<0, 0, 1, 4, 3, 1>,
- Conv::template process_tile<0, 0, 1, 4, 3, 2>,
- Conv::template process_tile<0, 0, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 5, 0, 0>,
- Conv::template process_tile<0, 0, 1, 5, 0, 1>,
- Conv::template process_tile<0, 0, 1, 5, 0, 2>,
- Conv::template process_tile<0, 0, 1, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 5, 1, 0>,
- Conv::template process_tile<0, 0, 1, 5, 1, 1>,
- Conv::template process_tile<0, 0, 1, 5, 1, 2>,
- Conv::template process_tile<0, 0, 1, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 5, 2, 0>,
- Conv::template process_tile<0, 0, 1, 5, 2, 1>,
- Conv::template process_tile<0, 0, 1, 5, 2, 2>,
- Conv::template process_tile<0, 0, 1, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 5, 3, 0>,
- Conv::template process_tile<0, 0, 1, 5, 3, 1>,
- Conv::template process_tile<0, 0, 1, 5, 3, 2>,
- Conv::template process_tile<0, 0, 1, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 1, 6, 0, 0>,
- Conv::template process_tile<0, 0, 1, 6, 0, 1>,
- Conv::template process_tile<0, 0, 1, 6, 0, 2>,
- Conv::template process_tile<0, 0, 1, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 1, 6, 1, 0>,
- Conv::template process_tile<0, 0, 1, 6, 1, 1>,
- Conv::template process_tile<0, 0, 1, 6, 1, 2>,
- Conv::template process_tile<0, 0, 1, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 1, 6, 2, 0>,
- Conv::template process_tile<0, 0, 1, 6, 2, 1>,
- Conv::template process_tile<0, 0, 1, 6, 2, 2>,
- Conv::template process_tile<0, 0, 1, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 1, 6, 3, 0>,
- Conv::template process_tile<0, 0, 1, 6, 3, 1>,
- Conv::template process_tile<0, 0, 1, 6, 3, 2>,
- Conv::template process_tile<0, 0, 1, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 0, 0, 0>,
- Conv::template process_tile<0, 0, 2, 0, 0, 1>,
- Conv::template process_tile<0, 0, 2, 0, 0, 2>,
- Conv::template process_tile<0, 0, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 0, 1, 0>,
- Conv::template process_tile<0, 0, 2, 0, 1, 1>,
- Conv::template process_tile<0, 0, 2, 0, 1, 2>,
- Conv::template process_tile<0, 0, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 0, 2, 0>,
- Conv::template process_tile<0, 0, 2, 0, 2, 1>,
- Conv::template process_tile<0, 0, 2, 0, 2, 2>,
- Conv::template process_tile<0, 0, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 0, 3, 0>,
- Conv::template process_tile<0, 0, 2, 0, 3, 1>,
- Conv::template process_tile<0, 0, 2, 0, 3, 2>,
- Conv::template process_tile<0, 0, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 1, 0, 0>,
- Conv::template process_tile<0, 0, 2, 1, 0, 1>,
- Conv::template process_tile<0, 0, 2, 1, 0, 2>,
- Conv::template process_tile<0, 0, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 1, 1, 0>,
- Conv::template process_tile<0, 0, 2, 1, 1, 1>,
- Conv::template process_tile<0, 0, 2, 1, 1, 2>,
- Conv::template process_tile<0, 0, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 1, 2, 0>,
- Conv::template process_tile<0, 0, 2, 1, 2, 1>,
- Conv::template process_tile<0, 0, 2, 1, 2, 2>,
- Conv::template process_tile<0, 0, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 1, 3, 0>,
- Conv::template process_tile<0, 0, 2, 1, 3, 1>,
- Conv::template process_tile<0, 0, 2, 1, 3, 2>,
- Conv::template process_tile<0, 0, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 2, 0, 0>,
- Conv::template process_tile<0, 0, 2, 2, 0, 1>,
- Conv::template process_tile<0, 0, 2, 2, 0, 2>,
- Conv::template process_tile<0, 0, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 2, 1, 0>,
- Conv::template process_tile<0, 0, 2, 2, 1, 1>,
- Conv::template process_tile<0, 0, 2, 2, 1, 2>,
- Conv::template process_tile<0, 0, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 2, 2, 0>,
- Conv::template process_tile<0, 0, 2, 2, 2, 1>,
- Conv::template process_tile<0, 0, 2, 2, 2, 2>,
- Conv::template process_tile<0, 0, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 2, 3, 0>,
- Conv::template process_tile<0, 0, 2, 2, 3, 1>,
- Conv::template process_tile<0, 0, 2, 2, 3, 2>,
- Conv::template process_tile<0, 0, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 3, 0, 0>,
- Conv::template process_tile<0, 0, 2, 3, 0, 1>,
- Conv::template process_tile<0, 0, 2, 3, 0, 2>,
- Conv::template process_tile<0, 0, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 3, 1, 0>,
- Conv::template process_tile<0, 0, 2, 3, 1, 1>,
- Conv::template process_tile<0, 0, 2, 3, 1, 2>,
- Conv::template process_tile<0, 0, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 3, 2, 0>,
- Conv::template process_tile<0, 0, 2, 3, 2, 1>,
- Conv::template process_tile<0, 0, 2, 3, 2, 2>,
- Conv::template process_tile<0, 0, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 3, 3, 0>,
- Conv::template process_tile<0, 0, 2, 3, 3, 1>,
- Conv::template process_tile<0, 0, 2, 3, 3, 2>,
- Conv::template process_tile<0, 0, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 4, 0, 0>,
- Conv::template process_tile<0, 0, 2, 4, 0, 1>,
- Conv::template process_tile<0, 0, 2, 4, 0, 2>,
- Conv::template process_tile<0, 0, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 4, 1, 0>,
- Conv::template process_tile<0, 0, 2, 4, 1, 1>,
- Conv::template process_tile<0, 0, 2, 4, 1, 2>,
- Conv::template process_tile<0, 0, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 4, 2, 0>,
- Conv::template process_tile<0, 0, 2, 4, 2, 1>,
- Conv::template process_tile<0, 0, 2, 4, 2, 2>,
- Conv::template process_tile<0, 0, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 4, 3, 0>,
- Conv::template process_tile<0, 0, 2, 4, 3, 1>,
- Conv::template process_tile<0, 0, 2, 4, 3, 2>,
- Conv::template process_tile<0, 0, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 5, 0, 0>,
- Conv::template process_tile<0, 0, 2, 5, 0, 1>,
- Conv::template process_tile<0, 0, 2, 5, 0, 2>,
- Conv::template process_tile<0, 0, 2, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 5, 1, 0>,
- Conv::template process_tile<0, 0, 2, 5, 1, 1>,
- Conv::template process_tile<0, 0, 2, 5, 1, 2>,
- Conv::template process_tile<0, 0, 2, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 5, 2, 0>,
- Conv::template process_tile<0, 0, 2, 5, 2, 1>,
- Conv::template process_tile<0, 0, 2, 5, 2, 2>,
- Conv::template process_tile<0, 0, 2, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 5, 3, 0>,
- Conv::template process_tile<0, 0, 2, 5, 3, 1>,
- Conv::template process_tile<0, 0, 2, 5, 3, 2>,
- Conv::template process_tile<0, 0, 2, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 2, 6, 0, 0>,
- Conv::template process_tile<0, 0, 2, 6, 0, 1>,
- Conv::template process_tile<0, 0, 2, 6, 0, 2>,
- Conv::template process_tile<0, 0, 2, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 2, 6, 1, 0>,
- Conv::template process_tile<0, 0, 2, 6, 1, 1>,
- Conv::template process_tile<0, 0, 2, 6, 1, 2>,
- Conv::template process_tile<0, 0, 2, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 2, 6, 2, 0>,
- Conv::template process_tile<0, 0, 2, 6, 2, 1>,
- Conv::template process_tile<0, 0, 2, 6, 2, 2>,
- Conv::template process_tile<0, 0, 2, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 2, 6, 3, 0>,
- Conv::template process_tile<0, 0, 2, 6, 3, 1>,
- Conv::template process_tile<0, 0, 2, 6, 3, 2>,
- Conv::template process_tile<0, 0, 2, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 0, 0, 0>,
- Conv::template process_tile<0, 0, 3, 0, 0, 1>,
- Conv::template process_tile<0, 0, 3, 0, 0, 2>,
- Conv::template process_tile<0, 0, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 0, 1, 0>,
- Conv::template process_tile<0, 0, 3, 0, 1, 1>,
- Conv::template process_tile<0, 0, 3, 0, 1, 2>,
- Conv::template process_tile<0, 0, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 0, 2, 0>,
- Conv::template process_tile<0, 0, 3, 0, 2, 1>,
- Conv::template process_tile<0, 0, 3, 0, 2, 2>,
- Conv::template process_tile<0, 0, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 0, 3, 0>,
- Conv::template process_tile<0, 0, 3, 0, 3, 1>,
- Conv::template process_tile<0, 0, 3, 0, 3, 2>,
- Conv::template process_tile<0, 0, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 1, 0, 0>,
- Conv::template process_tile<0, 0, 3, 1, 0, 1>,
- Conv::template process_tile<0, 0, 3, 1, 0, 2>,
- Conv::template process_tile<0, 0, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 1, 1, 0>,
- Conv::template process_tile<0, 0, 3, 1, 1, 1>,
- Conv::template process_tile<0, 0, 3, 1, 1, 2>,
- Conv::template process_tile<0, 0, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 1, 2, 0>,
- Conv::template process_tile<0, 0, 3, 1, 2, 1>,
- Conv::template process_tile<0, 0, 3, 1, 2, 2>,
- Conv::template process_tile<0, 0, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 1, 3, 0>,
- Conv::template process_tile<0, 0, 3, 1, 3, 1>,
- Conv::template process_tile<0, 0, 3, 1, 3, 2>,
- Conv::template process_tile<0, 0, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 2, 0, 0>,
- Conv::template process_tile<0, 0, 3, 2, 0, 1>,
- Conv::template process_tile<0, 0, 3, 2, 0, 2>,
- Conv::template process_tile<0, 0, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 2, 1, 0>,
- Conv::template process_tile<0, 0, 3, 2, 1, 1>,
- Conv::template process_tile<0, 0, 3, 2, 1, 2>,
- Conv::template process_tile<0, 0, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 2, 2, 0>,
- Conv::template process_tile<0, 0, 3, 2, 2, 1>,
- Conv::template process_tile<0, 0, 3, 2, 2, 2>,
- Conv::template process_tile<0, 0, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 2, 3, 0>,
- Conv::template process_tile<0, 0, 3, 2, 3, 1>,
- Conv::template process_tile<0, 0, 3, 2, 3, 2>,
- Conv::template process_tile<0, 0, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 3, 0, 0>,
- Conv::template process_tile<0, 0, 3, 3, 0, 1>,
- Conv::template process_tile<0, 0, 3, 3, 0, 2>,
- Conv::template process_tile<0, 0, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 3, 1, 0>,
- Conv::template process_tile<0, 0, 3, 3, 1, 1>,
- Conv::template process_tile<0, 0, 3, 3, 1, 2>,
- Conv::template process_tile<0, 0, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 3, 2, 0>,
- Conv::template process_tile<0, 0, 3, 3, 2, 1>,
- Conv::template process_tile<0, 0, 3, 3, 2, 2>,
- Conv::template process_tile<0, 0, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 3, 3, 0>,
- Conv::template process_tile<0, 0, 3, 3, 3, 1>,
- Conv::template process_tile<0, 0, 3, 3, 3, 2>,
- Conv::template process_tile<0, 0, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 4, 0, 0>,
- Conv::template process_tile<0, 0, 3, 4, 0, 1>,
- Conv::template process_tile<0, 0, 3, 4, 0, 2>,
- Conv::template process_tile<0, 0, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 4, 1, 0>,
- Conv::template process_tile<0, 0, 3, 4, 1, 1>,
- Conv::template process_tile<0, 0, 3, 4, 1, 2>,
- Conv::template process_tile<0, 0, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 4, 2, 0>,
- Conv::template process_tile<0, 0, 3, 4, 2, 1>,
- Conv::template process_tile<0, 0, 3, 4, 2, 2>,
- Conv::template process_tile<0, 0, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 4, 3, 0>,
- Conv::template process_tile<0, 0, 3, 4, 3, 1>,
- Conv::template process_tile<0, 0, 3, 4, 3, 2>,
- Conv::template process_tile<0, 0, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 5, 0, 0>,
- Conv::template process_tile<0, 0, 3, 5, 0, 1>,
- Conv::template process_tile<0, 0, 3, 5, 0, 2>,
- Conv::template process_tile<0, 0, 3, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 5, 1, 0>,
- Conv::template process_tile<0, 0, 3, 5, 1, 1>,
- Conv::template process_tile<0, 0, 3, 5, 1, 2>,
- Conv::template process_tile<0, 0, 3, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 5, 2, 0>,
- Conv::template process_tile<0, 0, 3, 5, 2, 1>,
- Conv::template process_tile<0, 0, 3, 5, 2, 2>,
- Conv::template process_tile<0, 0, 3, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 5, 3, 0>,
- Conv::template process_tile<0, 0, 3, 5, 3, 1>,
- Conv::template process_tile<0, 0, 3, 5, 3, 2>,
- Conv::template process_tile<0, 0, 3, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 3, 6, 0, 0>,
- Conv::template process_tile<0, 0, 3, 6, 0, 1>,
- Conv::template process_tile<0, 0, 3, 6, 0, 2>,
- Conv::template process_tile<0, 0, 3, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 3, 6, 1, 0>,
- Conv::template process_tile<0, 0, 3, 6, 1, 1>,
- Conv::template process_tile<0, 0, 3, 6, 1, 2>,
- Conv::template process_tile<0, 0, 3, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 3, 6, 2, 0>,
- Conv::template process_tile<0, 0, 3, 6, 2, 1>,
- Conv::template process_tile<0, 0, 3, 6, 2, 2>,
- Conv::template process_tile<0, 0, 3, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 3, 6, 3, 0>,
- Conv::template process_tile<0, 0, 3, 6, 3, 1>,
- Conv::template process_tile<0, 0, 3, 6, 3, 2>,
- Conv::template process_tile<0, 0, 3, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 0, 0, 0>,
- Conv::template process_tile<0, 0, 4, 0, 0, 1>,
- Conv::template process_tile<0, 0, 4, 0, 0, 2>,
- Conv::template process_tile<0, 0, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 0, 1, 0>,
- Conv::template process_tile<0, 0, 4, 0, 1, 1>,
- Conv::template process_tile<0, 0, 4, 0, 1, 2>,
- Conv::template process_tile<0, 0, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 0, 2, 0>,
- Conv::template process_tile<0, 0, 4, 0, 2, 1>,
- Conv::template process_tile<0, 0, 4, 0, 2, 2>,
- Conv::template process_tile<0, 0, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 0, 3, 0>,
- Conv::template process_tile<0, 0, 4, 0, 3, 1>,
- Conv::template process_tile<0, 0, 4, 0, 3, 2>,
- Conv::template process_tile<0, 0, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 1, 0, 0>,
- Conv::template process_tile<0, 0, 4, 1, 0, 1>,
- Conv::template process_tile<0, 0, 4, 1, 0, 2>,
- Conv::template process_tile<0, 0, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 1, 1, 0>,
- Conv::template process_tile<0, 0, 4, 1, 1, 1>,
- Conv::template process_tile<0, 0, 4, 1, 1, 2>,
- Conv::template process_tile<0, 0, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 1, 2, 0>,
- Conv::template process_tile<0, 0, 4, 1, 2, 1>,
- Conv::template process_tile<0, 0, 4, 1, 2, 2>,
- Conv::template process_tile<0, 0, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 1, 3, 0>,
- Conv::template process_tile<0, 0, 4, 1, 3, 1>,
- Conv::template process_tile<0, 0, 4, 1, 3, 2>,
- Conv::template process_tile<0, 0, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 2, 0, 0>,
- Conv::template process_tile<0, 0, 4, 2, 0, 1>,
- Conv::template process_tile<0, 0, 4, 2, 0, 2>,
- Conv::template process_tile<0, 0, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 2, 1, 0>,
- Conv::template process_tile<0, 0, 4, 2, 1, 1>,
- Conv::template process_tile<0, 0, 4, 2, 1, 2>,
- Conv::template process_tile<0, 0, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 2, 2, 0>,
- Conv::template process_tile<0, 0, 4, 2, 2, 1>,
- Conv::template process_tile<0, 0, 4, 2, 2, 2>,
- Conv::template process_tile<0, 0, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 2, 3, 0>,
- Conv::template process_tile<0, 0, 4, 2, 3, 1>,
- Conv::template process_tile<0, 0, 4, 2, 3, 2>,
- Conv::template process_tile<0, 0, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 3, 0, 0>,
- Conv::template process_tile<0, 0, 4, 3, 0, 1>,
- Conv::template process_tile<0, 0, 4, 3, 0, 2>,
- Conv::template process_tile<0, 0, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 3, 1, 0>,
- Conv::template process_tile<0, 0, 4, 3, 1, 1>,
- Conv::template process_tile<0, 0, 4, 3, 1, 2>,
- Conv::template process_tile<0, 0, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 3, 2, 0>,
- Conv::template process_tile<0, 0, 4, 3, 2, 1>,
- Conv::template process_tile<0, 0, 4, 3, 2, 2>,
- Conv::template process_tile<0, 0, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 3, 3, 0>,
- Conv::template process_tile<0, 0, 4, 3, 3, 1>,
- Conv::template process_tile<0, 0, 4, 3, 3, 2>,
- Conv::template process_tile<0, 0, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 4, 0, 0>,
- Conv::template process_tile<0, 0, 4, 4, 0, 1>,
- Conv::template process_tile<0, 0, 4, 4, 0, 2>,
- Conv::template process_tile<0, 0, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 4, 1, 0>,
- Conv::template process_tile<0, 0, 4, 4, 1, 1>,
- Conv::template process_tile<0, 0, 4, 4, 1, 2>,
- Conv::template process_tile<0, 0, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 4, 2, 0>,
- Conv::template process_tile<0, 0, 4, 4, 2, 1>,
- Conv::template process_tile<0, 0, 4, 4, 2, 2>,
- Conv::template process_tile<0, 0, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 4, 3, 0>,
- Conv::template process_tile<0, 0, 4, 4, 3, 1>,
- Conv::template process_tile<0, 0, 4, 4, 3, 2>,
- Conv::template process_tile<0, 0, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 5, 0, 0>,
- Conv::template process_tile<0, 0, 4, 5, 0, 1>,
- Conv::template process_tile<0, 0, 4, 5, 0, 2>,
- Conv::template process_tile<0, 0, 4, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 5, 1, 0>,
- Conv::template process_tile<0, 0, 4, 5, 1, 1>,
- Conv::template process_tile<0, 0, 4, 5, 1, 2>,
- Conv::template process_tile<0, 0, 4, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 5, 2, 0>,
- Conv::template process_tile<0, 0, 4, 5, 2, 1>,
- Conv::template process_tile<0, 0, 4, 5, 2, 2>,
- Conv::template process_tile<0, 0, 4, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 5, 3, 0>,
- Conv::template process_tile<0, 0, 4, 5, 3, 1>,
- Conv::template process_tile<0, 0, 4, 5, 3, 2>,
- Conv::template process_tile<0, 0, 4, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 4, 6, 0, 0>,
- Conv::template process_tile<0, 0, 4, 6, 0, 1>,
- Conv::template process_tile<0, 0, 4, 6, 0, 2>,
- Conv::template process_tile<0, 0, 4, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 4, 6, 1, 0>,
- Conv::template process_tile<0, 0, 4, 6, 1, 1>,
- Conv::template process_tile<0, 0, 4, 6, 1, 2>,
- Conv::template process_tile<0, 0, 4, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 4, 6, 2, 0>,
- Conv::template process_tile<0, 0, 4, 6, 2, 1>,
- Conv::template process_tile<0, 0, 4, 6, 2, 2>,
- Conv::template process_tile<0, 0, 4, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 4, 6, 3, 0>,
- Conv::template process_tile<0, 0, 4, 6, 3, 1>,
- Conv::template process_tile<0, 0, 4, 6, 3, 2>,
- Conv::template process_tile<0, 0, 4, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 0, 0, 0>,
- Conv::template process_tile<0, 0, 5, 0, 0, 1>,
- Conv::template process_tile<0, 0, 5, 0, 0, 2>,
- Conv::template process_tile<0, 0, 5, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 0, 1, 0>,
- Conv::template process_tile<0, 0, 5, 0, 1, 1>,
- Conv::template process_tile<0, 0, 5, 0, 1, 2>,
- Conv::template process_tile<0, 0, 5, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 0, 2, 0>,
- Conv::template process_tile<0, 0, 5, 0, 2, 1>,
- Conv::template process_tile<0, 0, 5, 0, 2, 2>,
- Conv::template process_tile<0, 0, 5, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 0, 3, 0>,
- Conv::template process_tile<0, 0, 5, 0, 3, 1>,
- Conv::template process_tile<0, 0, 5, 0, 3, 2>,
- Conv::template process_tile<0, 0, 5, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 1, 0, 0>,
- Conv::template process_tile<0, 0, 5, 1, 0, 1>,
- Conv::template process_tile<0, 0, 5, 1, 0, 2>,
- Conv::template process_tile<0, 0, 5, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 1, 1, 0>,
- Conv::template process_tile<0, 0, 5, 1, 1, 1>,
- Conv::template process_tile<0, 0, 5, 1, 1, 2>,
- Conv::template process_tile<0, 0, 5, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 1, 2, 0>,
- Conv::template process_tile<0, 0, 5, 1, 2, 1>,
- Conv::template process_tile<0, 0, 5, 1, 2, 2>,
- Conv::template process_tile<0, 0, 5, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 1, 3, 0>,
- Conv::template process_tile<0, 0, 5, 1, 3, 1>,
- Conv::template process_tile<0, 0, 5, 1, 3, 2>,
- Conv::template process_tile<0, 0, 5, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 2, 0, 0>,
- Conv::template process_tile<0, 0, 5, 2, 0, 1>,
- Conv::template process_tile<0, 0, 5, 2, 0, 2>,
- Conv::template process_tile<0, 0, 5, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 2, 1, 0>,
- Conv::template process_tile<0, 0, 5, 2, 1, 1>,
- Conv::template process_tile<0, 0, 5, 2, 1, 2>,
- Conv::template process_tile<0, 0, 5, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 2, 2, 0>,
- Conv::template process_tile<0, 0, 5, 2, 2, 1>,
- Conv::template process_tile<0, 0, 5, 2, 2, 2>,
- Conv::template process_tile<0, 0, 5, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 2, 3, 0>,
- Conv::template process_tile<0, 0, 5, 2, 3, 1>,
- Conv::template process_tile<0, 0, 5, 2, 3, 2>,
- Conv::template process_tile<0, 0, 5, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 3, 0, 0>,
- Conv::template process_tile<0, 0, 5, 3, 0, 1>,
- Conv::template process_tile<0, 0, 5, 3, 0, 2>,
- Conv::template process_tile<0, 0, 5, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 3, 1, 0>,
- Conv::template process_tile<0, 0, 5, 3, 1, 1>,
- Conv::template process_tile<0, 0, 5, 3, 1, 2>,
- Conv::template process_tile<0, 0, 5, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 3, 2, 0>,
- Conv::template process_tile<0, 0, 5, 3, 2, 1>,
- Conv::template process_tile<0, 0, 5, 3, 2, 2>,
- Conv::template process_tile<0, 0, 5, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 3, 3, 0>,
- Conv::template process_tile<0, 0, 5, 3, 3, 1>,
- Conv::template process_tile<0, 0, 5, 3, 3, 2>,
- Conv::template process_tile<0, 0, 5, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 4, 0, 0>,
- Conv::template process_tile<0, 0, 5, 4, 0, 1>,
- Conv::template process_tile<0, 0, 5, 4, 0, 2>,
- Conv::template process_tile<0, 0, 5, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 4, 1, 0>,
- Conv::template process_tile<0, 0, 5, 4, 1, 1>,
- Conv::template process_tile<0, 0, 5, 4, 1, 2>,
- Conv::template process_tile<0, 0, 5, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 4, 2, 0>,
- Conv::template process_tile<0, 0, 5, 4, 2, 1>,
- Conv::template process_tile<0, 0, 5, 4, 2, 2>,
- Conv::template process_tile<0, 0, 5, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 4, 3, 0>,
- Conv::template process_tile<0, 0, 5, 4, 3, 1>,
- Conv::template process_tile<0, 0, 5, 4, 3, 2>,
- Conv::template process_tile<0, 0, 5, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 5, 0, 0>,
- Conv::template process_tile<0, 0, 5, 5, 0, 1>,
- Conv::template process_tile<0, 0, 5, 5, 0, 2>,
- Conv::template process_tile<0, 0, 5, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 5, 1, 0>,
- Conv::template process_tile<0, 0, 5, 5, 1, 1>,
- Conv::template process_tile<0, 0, 5, 5, 1, 2>,
- Conv::template process_tile<0, 0, 5, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 5, 2, 0>,
- Conv::template process_tile<0, 0, 5, 5, 2, 1>,
- Conv::template process_tile<0, 0, 5, 5, 2, 2>,
- Conv::template process_tile<0, 0, 5, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 5, 3, 0>,
- Conv::template process_tile<0, 0, 5, 5, 3, 1>,
- Conv::template process_tile<0, 0, 5, 5, 3, 2>,
- Conv::template process_tile<0, 0, 5, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 5, 6, 0, 0>,
- Conv::template process_tile<0, 0, 5, 6, 0, 1>,
- Conv::template process_tile<0, 0, 5, 6, 0, 2>,
- Conv::template process_tile<0, 0, 5, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 5, 6, 1, 0>,
- Conv::template process_tile<0, 0, 5, 6, 1, 1>,
- Conv::template process_tile<0, 0, 5, 6, 1, 2>,
- Conv::template process_tile<0, 0, 5, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 5, 6, 2, 0>,
- Conv::template process_tile<0, 0, 5, 6, 2, 1>,
- Conv::template process_tile<0, 0, 5, 6, 2, 2>,
- Conv::template process_tile<0, 0, 5, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 5, 6, 3, 0>,
- Conv::template process_tile<0, 0, 5, 6, 3, 1>,
- Conv::template process_tile<0, 0, 5, 6, 3, 2>,
- Conv::template process_tile<0, 0, 5, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 0, 0, 0>,
- Conv::template process_tile<0, 0, 6, 0, 0, 1>,
- Conv::template process_tile<0, 0, 6, 0, 0, 2>,
- Conv::template process_tile<0, 0, 6, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 0, 1, 0>,
- Conv::template process_tile<0, 0, 6, 0, 1, 1>,
- Conv::template process_tile<0, 0, 6, 0, 1, 2>,
- Conv::template process_tile<0, 0, 6, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 0, 2, 0>,
- Conv::template process_tile<0, 0, 6, 0, 2, 1>,
- Conv::template process_tile<0, 0, 6, 0, 2, 2>,
- Conv::template process_tile<0, 0, 6, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 0, 3, 0>,
- Conv::template process_tile<0, 0, 6, 0, 3, 1>,
- Conv::template process_tile<0, 0, 6, 0, 3, 2>,
- Conv::template process_tile<0, 0, 6, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 1, 0, 0>,
- Conv::template process_tile<0, 0, 6, 1, 0, 1>,
- Conv::template process_tile<0, 0, 6, 1, 0, 2>,
- Conv::template process_tile<0, 0, 6, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 1, 1, 0>,
- Conv::template process_tile<0, 0, 6, 1, 1, 1>,
- Conv::template process_tile<0, 0, 6, 1, 1, 2>,
- Conv::template process_tile<0, 0, 6, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 1, 2, 0>,
- Conv::template process_tile<0, 0, 6, 1, 2, 1>,
- Conv::template process_tile<0, 0, 6, 1, 2, 2>,
- Conv::template process_tile<0, 0, 6, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 1, 3, 0>,
- Conv::template process_tile<0, 0, 6, 1, 3, 1>,
- Conv::template process_tile<0, 0, 6, 1, 3, 2>,
- Conv::template process_tile<0, 0, 6, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 2, 0, 0>,
- Conv::template process_tile<0, 0, 6, 2, 0, 1>,
- Conv::template process_tile<0, 0, 6, 2, 0, 2>,
- Conv::template process_tile<0, 0, 6, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 2, 1, 0>,
- Conv::template process_tile<0, 0, 6, 2, 1, 1>,
- Conv::template process_tile<0, 0, 6, 2, 1, 2>,
- Conv::template process_tile<0, 0, 6, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 2, 2, 0>,
- Conv::template process_tile<0, 0, 6, 2, 2, 1>,
- Conv::template process_tile<0, 0, 6, 2, 2, 2>,
- Conv::template process_tile<0, 0, 6, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 2, 3, 0>,
- Conv::template process_tile<0, 0, 6, 2, 3, 1>,
- Conv::template process_tile<0, 0, 6, 2, 3, 2>,
- Conv::template process_tile<0, 0, 6, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 3, 0, 0>,
- Conv::template process_tile<0, 0, 6, 3, 0, 1>,
- Conv::template process_tile<0, 0, 6, 3, 0, 2>,
- Conv::template process_tile<0, 0, 6, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 3, 1, 0>,
- Conv::template process_tile<0, 0, 6, 3, 1, 1>,
- Conv::template process_tile<0, 0, 6, 3, 1, 2>,
- Conv::template process_tile<0, 0, 6, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 3, 2, 0>,
- Conv::template process_tile<0, 0, 6, 3, 2, 1>,
- Conv::template process_tile<0, 0, 6, 3, 2, 2>,
- Conv::template process_tile<0, 0, 6, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 3, 3, 0>,
- Conv::template process_tile<0, 0, 6, 3, 3, 1>,
- Conv::template process_tile<0, 0, 6, 3, 3, 2>,
- Conv::template process_tile<0, 0, 6, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 4, 0, 0>,
- Conv::template process_tile<0, 0, 6, 4, 0, 1>,
- Conv::template process_tile<0, 0, 6, 4, 0, 2>,
- Conv::template process_tile<0, 0, 6, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 4, 1, 0>,
- Conv::template process_tile<0, 0, 6, 4, 1, 1>,
- Conv::template process_tile<0, 0, 6, 4, 1, 2>,
- Conv::template process_tile<0, 0, 6, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 4, 2, 0>,
- Conv::template process_tile<0, 0, 6, 4, 2, 1>,
- Conv::template process_tile<0, 0, 6, 4, 2, 2>,
- Conv::template process_tile<0, 0, 6, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 4, 3, 0>,
- Conv::template process_tile<0, 0, 6, 4, 3, 1>,
- Conv::template process_tile<0, 0, 6, 4, 3, 2>,
- Conv::template process_tile<0, 0, 6, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 5, 0, 0>,
- Conv::template process_tile<0, 0, 6, 5, 0, 1>,
- Conv::template process_tile<0, 0, 6, 5, 0, 2>,
- Conv::template process_tile<0, 0, 6, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 5, 1, 0>,
- Conv::template process_tile<0, 0, 6, 5, 1, 1>,
- Conv::template process_tile<0, 0, 6, 5, 1, 2>,
- Conv::template process_tile<0, 0, 6, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 5, 2, 0>,
- Conv::template process_tile<0, 0, 6, 5, 2, 1>,
- Conv::template process_tile<0, 0, 6, 5, 2, 2>,
- Conv::template process_tile<0, 0, 6, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 5, 3, 0>,
- Conv::template process_tile<0, 0, 6, 5, 3, 1>,
- Conv::template process_tile<0, 0, 6, 5, 3, 2>,
- Conv::template process_tile<0, 0, 6, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 0, 6, 6, 0, 0>,
- Conv::template process_tile<0, 0, 6, 6, 0, 1>,
- Conv::template process_tile<0, 0, 6, 6, 0, 2>,
- Conv::template process_tile<0, 0, 6, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 0, 6, 6, 1, 0>,
- Conv::template process_tile<0, 0, 6, 6, 1, 1>,
- Conv::template process_tile<0, 0, 6, 6, 1, 2>,
- Conv::template process_tile<0, 0, 6, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 0, 6, 6, 2, 0>,
- Conv::template process_tile<0, 0, 6, 6, 2, 1>,
- Conv::template process_tile<0, 0, 6, 6, 2, 2>,
- Conv::template process_tile<0, 0, 6, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 0, 6, 6, 3, 0>,
- Conv::template process_tile<0, 0, 6, 6, 3, 1>,
- Conv::template process_tile<0, 0, 6, 6, 3, 2>,
- Conv::template process_tile<0, 0, 6, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 0
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 0, 0, 0>,
- Conv::template process_tile<0, 1, 0, 0, 0, 1>,
- Conv::template process_tile<0, 1, 0, 0, 0, 2>,
- Conv::template process_tile<0, 1, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 0, 1, 0>,
- Conv::template process_tile<0, 1, 0, 0, 1, 1>,
- Conv::template process_tile<0, 1, 0, 0, 1, 2>,
- Conv::template process_tile<0, 1, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 0, 2, 0>,
- Conv::template process_tile<0, 1, 0, 0, 2, 1>,
- Conv::template process_tile<0, 1, 0, 0, 2, 2>,
- Conv::template process_tile<0, 1, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 0, 3, 0>,
- Conv::template process_tile<0, 1, 0, 0, 3, 1>,
- Conv::template process_tile<0, 1, 0, 0, 3, 2>,
- Conv::template process_tile<0, 1, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 1, 0, 0>,
- Conv::template process_tile<0, 1, 0, 1, 0, 1>,
- Conv::template process_tile<0, 1, 0, 1, 0, 2>,
- Conv::template process_tile<0, 1, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 1, 1, 0>,
- Conv::template process_tile<0, 1, 0, 1, 1, 1>,
- Conv::template process_tile<0, 1, 0, 1, 1, 2>,
- Conv::template process_tile<0, 1, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 1, 2, 0>,
- Conv::template process_tile<0, 1, 0, 1, 2, 1>,
- Conv::template process_tile<0, 1, 0, 1, 2, 2>,
- Conv::template process_tile<0, 1, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 1, 3, 0>,
- Conv::template process_tile<0, 1, 0, 1, 3, 1>,
- Conv::template process_tile<0, 1, 0, 1, 3, 2>,
- Conv::template process_tile<0, 1, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 2, 0, 0>,
- Conv::template process_tile<0, 1, 0, 2, 0, 1>,
- Conv::template process_tile<0, 1, 0, 2, 0, 2>,
- Conv::template process_tile<0, 1, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 2, 1, 0>,
- Conv::template process_tile<0, 1, 0, 2, 1, 1>,
- Conv::template process_tile<0, 1, 0, 2, 1, 2>,
- Conv::template process_tile<0, 1, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 2, 2, 0>,
- Conv::template process_tile<0, 1, 0, 2, 2, 1>,
- Conv::template process_tile<0, 1, 0, 2, 2, 2>,
- Conv::template process_tile<0, 1, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 2, 3, 0>,
- Conv::template process_tile<0, 1, 0, 2, 3, 1>,
- Conv::template process_tile<0, 1, 0, 2, 3, 2>,
- Conv::template process_tile<0, 1, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 3, 0, 0>,
- Conv::template process_tile<0, 1, 0, 3, 0, 1>,
- Conv::template process_tile<0, 1, 0, 3, 0, 2>,
- Conv::template process_tile<0, 1, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 3, 1, 0>,
- Conv::template process_tile<0, 1, 0, 3, 1, 1>,
- Conv::template process_tile<0, 1, 0, 3, 1, 2>,
- Conv::template process_tile<0, 1, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 3, 2, 0>,
- Conv::template process_tile<0, 1, 0, 3, 2, 1>,
- Conv::template process_tile<0, 1, 0, 3, 2, 2>,
- Conv::template process_tile<0, 1, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 3, 3, 0>,
- Conv::template process_tile<0, 1, 0, 3, 3, 1>,
- Conv::template process_tile<0, 1, 0, 3, 3, 2>,
- Conv::template process_tile<0, 1, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 4, 0, 0>,
- Conv::template process_tile<0, 1, 0, 4, 0, 1>,
- Conv::template process_tile<0, 1, 0, 4, 0, 2>,
- Conv::template process_tile<0, 1, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 4, 1, 0>,
- Conv::template process_tile<0, 1, 0, 4, 1, 1>,
- Conv::template process_tile<0, 1, 0, 4, 1, 2>,
- Conv::template process_tile<0, 1, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 4, 2, 0>,
- Conv::template process_tile<0, 1, 0, 4, 2, 1>,
- Conv::template process_tile<0, 1, 0, 4, 2, 2>,
- Conv::template process_tile<0, 1, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 4, 3, 0>,
- Conv::template process_tile<0, 1, 0, 4, 3, 1>,
- Conv::template process_tile<0, 1, 0, 4, 3, 2>,
- Conv::template process_tile<0, 1, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 5, 0, 0>,
- Conv::template process_tile<0, 1, 0, 5, 0, 1>,
- Conv::template process_tile<0, 1, 0, 5, 0, 2>,
- Conv::template process_tile<0, 1, 0, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 5, 1, 0>,
- Conv::template process_tile<0, 1, 0, 5, 1, 1>,
- Conv::template process_tile<0, 1, 0, 5, 1, 2>,
- Conv::template process_tile<0, 1, 0, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 5, 2, 0>,
- Conv::template process_tile<0, 1, 0, 5, 2, 1>,
- Conv::template process_tile<0, 1, 0, 5, 2, 2>,
- Conv::template process_tile<0, 1, 0, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 5, 3, 0>,
- Conv::template process_tile<0, 1, 0, 5, 3, 1>,
- Conv::template process_tile<0, 1, 0, 5, 3, 2>,
- Conv::template process_tile<0, 1, 0, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 0, 6, 0, 0>,
- Conv::template process_tile<0, 1, 0, 6, 0, 1>,
- Conv::template process_tile<0, 1, 0, 6, 0, 2>,
- Conv::template process_tile<0, 1, 0, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 0, 6, 1, 0>,
- Conv::template process_tile<0, 1, 0, 6, 1, 1>,
- Conv::template process_tile<0, 1, 0, 6, 1, 2>,
- Conv::template process_tile<0, 1, 0, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 0, 6, 2, 0>,
- Conv::template process_tile<0, 1, 0, 6, 2, 1>,
- Conv::template process_tile<0, 1, 0, 6, 2, 2>,
- Conv::template process_tile<0, 1, 0, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 0, 6, 3, 0>,
- Conv::template process_tile<0, 1, 0, 6, 3, 1>,
- Conv::template process_tile<0, 1, 0, 6, 3, 2>,
- Conv::template process_tile<0, 1, 0, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 0, 0, 0>,
- Conv::template process_tile<0, 1, 1, 0, 0, 1>,
- Conv::template process_tile<0, 1, 1, 0, 0, 2>,
- Conv::template process_tile<0, 1, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 0, 1, 0>,
- Conv::template process_tile<0, 1, 1, 0, 1, 1>,
- Conv::template process_tile<0, 1, 1, 0, 1, 2>,
- Conv::template process_tile<0, 1, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 0, 2, 0>,
- Conv::template process_tile<0, 1, 1, 0, 2, 1>,
- Conv::template process_tile<0, 1, 1, 0, 2, 2>,
- Conv::template process_tile<0, 1, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 0, 3, 0>,
- Conv::template process_tile<0, 1, 1, 0, 3, 1>,
- Conv::template process_tile<0, 1, 1, 0, 3, 2>,
- Conv::template process_tile<0, 1, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 1, 0, 0>,
- Conv::template process_tile<0, 1, 1, 1, 0, 1>,
- Conv::template process_tile<0, 1, 1, 1, 0, 2>,
- Conv::template process_tile<0, 1, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 1, 1, 0>,
- Conv::template process_tile<0, 1, 1, 1, 1, 1>,
- Conv::template process_tile<0, 1, 1, 1, 1, 2>,
- Conv::template process_tile<0, 1, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 1, 2, 0>,
- Conv::template process_tile<0, 1, 1, 1, 2, 1>,
- Conv::template process_tile<0, 1, 1, 1, 2, 2>,
- Conv::template process_tile<0, 1, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 1, 3, 0>,
- Conv::template process_tile<0, 1, 1, 1, 3, 1>,
- Conv::template process_tile<0, 1, 1, 1, 3, 2>,
- Conv::template process_tile<0, 1, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 2, 0, 0>,
- Conv::template process_tile<0, 1, 1, 2, 0, 1>,
- Conv::template process_tile<0, 1, 1, 2, 0, 2>,
- Conv::template process_tile<0, 1, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 2, 1, 0>,
- Conv::template process_tile<0, 1, 1, 2, 1, 1>,
- Conv::template process_tile<0, 1, 1, 2, 1, 2>,
- Conv::template process_tile<0, 1, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 2, 2, 0>,
- Conv::template process_tile<0, 1, 1, 2, 2, 1>,
- Conv::template process_tile<0, 1, 1, 2, 2, 2>,
- Conv::template process_tile<0, 1, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 2, 3, 0>,
- Conv::template process_tile<0, 1, 1, 2, 3, 1>,
- Conv::template process_tile<0, 1, 1, 2, 3, 2>,
- Conv::template process_tile<0, 1, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 3, 0, 0>,
- Conv::template process_tile<0, 1, 1, 3, 0, 1>,
- Conv::template process_tile<0, 1, 1, 3, 0, 2>,
- Conv::template process_tile<0, 1, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 3, 1, 0>,
- Conv::template process_tile<0, 1, 1, 3, 1, 1>,
- Conv::template process_tile<0, 1, 1, 3, 1, 2>,
- Conv::template process_tile<0, 1, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 3, 2, 0>,
- Conv::template process_tile<0, 1, 1, 3, 2, 1>,
- Conv::template process_tile<0, 1, 1, 3, 2, 2>,
- Conv::template process_tile<0, 1, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 3, 3, 0>,
- Conv::template process_tile<0, 1, 1, 3, 3, 1>,
- Conv::template process_tile<0, 1, 1, 3, 3, 2>,
- Conv::template process_tile<0, 1, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 4, 0, 0>,
- Conv::template process_tile<0, 1, 1, 4, 0, 1>,
- Conv::template process_tile<0, 1, 1, 4, 0, 2>,
- Conv::template process_tile<0, 1, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 4, 1, 0>,
- Conv::template process_tile<0, 1, 1, 4, 1, 1>,
- Conv::template process_tile<0, 1, 1, 4, 1, 2>,
- Conv::template process_tile<0, 1, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 4, 2, 0>,
- Conv::template process_tile<0, 1, 1, 4, 2, 1>,
- Conv::template process_tile<0, 1, 1, 4, 2, 2>,
- Conv::template process_tile<0, 1, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 4, 3, 0>,
- Conv::template process_tile<0, 1, 1, 4, 3, 1>,
- Conv::template process_tile<0, 1, 1, 4, 3, 2>,
- Conv::template process_tile<0, 1, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 5, 0, 0>,
- Conv::template process_tile<0, 1, 1, 5, 0, 1>,
- Conv::template process_tile<0, 1, 1, 5, 0, 2>,
- Conv::template process_tile<0, 1, 1, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 5, 1, 0>,
- Conv::template process_tile<0, 1, 1, 5, 1, 1>,
- Conv::template process_tile<0, 1, 1, 5, 1, 2>,
- Conv::template process_tile<0, 1, 1, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 5, 2, 0>,
- Conv::template process_tile<0, 1, 1, 5, 2, 1>,
- Conv::template process_tile<0, 1, 1, 5, 2, 2>,
- Conv::template process_tile<0, 1, 1, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 5, 3, 0>,
- Conv::template process_tile<0, 1, 1, 5, 3, 1>,
- Conv::template process_tile<0, 1, 1, 5, 3, 2>,
- Conv::template process_tile<0, 1, 1, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 1, 6, 0, 0>,
- Conv::template process_tile<0, 1, 1, 6, 0, 1>,
- Conv::template process_tile<0, 1, 1, 6, 0, 2>,
- Conv::template process_tile<0, 1, 1, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 1, 6, 1, 0>,
- Conv::template process_tile<0, 1, 1, 6, 1, 1>,
- Conv::template process_tile<0, 1, 1, 6, 1, 2>,
- Conv::template process_tile<0, 1, 1, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 1, 6, 2, 0>,
- Conv::template process_tile<0, 1, 1, 6, 2, 1>,
- Conv::template process_tile<0, 1, 1, 6, 2, 2>,
- Conv::template process_tile<0, 1, 1, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 1, 6, 3, 0>,
- Conv::template process_tile<0, 1, 1, 6, 3, 1>,
- Conv::template process_tile<0, 1, 1, 6, 3, 2>,
- Conv::template process_tile<0, 1, 1, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 0, 0, 0>,
- Conv::template process_tile<0, 1, 2, 0, 0, 1>,
- Conv::template process_tile<0, 1, 2, 0, 0, 2>,
- Conv::template process_tile<0, 1, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 0, 1, 0>,
- Conv::template process_tile<0, 1, 2, 0, 1, 1>,
- Conv::template process_tile<0, 1, 2, 0, 1, 2>,
- Conv::template process_tile<0, 1, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 0, 2, 0>,
- Conv::template process_tile<0, 1, 2, 0, 2, 1>,
- Conv::template process_tile<0, 1, 2, 0, 2, 2>,
- Conv::template process_tile<0, 1, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 0, 3, 0>,
- Conv::template process_tile<0, 1, 2, 0, 3, 1>,
- Conv::template process_tile<0, 1, 2, 0, 3, 2>,
- Conv::template process_tile<0, 1, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 1, 0, 0>,
- Conv::template process_tile<0, 1, 2, 1, 0, 1>,
- Conv::template process_tile<0, 1, 2, 1, 0, 2>,
- Conv::template process_tile<0, 1, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 1, 1, 0>,
- Conv::template process_tile<0, 1, 2, 1, 1, 1>,
- Conv::template process_tile<0, 1, 2, 1, 1, 2>,
- Conv::template process_tile<0, 1, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 1, 2, 0>,
- Conv::template process_tile<0, 1, 2, 1, 2, 1>,
- Conv::template process_tile<0, 1, 2, 1, 2, 2>,
- Conv::template process_tile<0, 1, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 1, 3, 0>,
- Conv::template process_tile<0, 1, 2, 1, 3, 1>,
- Conv::template process_tile<0, 1, 2, 1, 3, 2>,
- Conv::template process_tile<0, 1, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 2, 0, 0>,
- Conv::template process_tile<0, 1, 2, 2, 0, 1>,
- Conv::template process_tile<0, 1, 2, 2, 0, 2>,
- Conv::template process_tile<0, 1, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 2, 1, 0>,
- Conv::template process_tile<0, 1, 2, 2, 1, 1>,
- Conv::template process_tile<0, 1, 2, 2, 1, 2>,
- Conv::template process_tile<0, 1, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 2, 2, 0>,
- Conv::template process_tile<0, 1, 2, 2, 2, 1>,
- Conv::template process_tile<0, 1, 2, 2, 2, 2>,
- Conv::template process_tile<0, 1, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 2, 3, 0>,
- Conv::template process_tile<0, 1, 2, 2, 3, 1>,
- Conv::template process_tile<0, 1, 2, 2, 3, 2>,
- Conv::template process_tile<0, 1, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 3, 0, 0>,
- Conv::template process_tile<0, 1, 2, 3, 0, 1>,
- Conv::template process_tile<0, 1, 2, 3, 0, 2>,
- Conv::template process_tile<0, 1, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 3, 1, 0>,
- Conv::template process_tile<0, 1, 2, 3, 1, 1>,
- Conv::template process_tile<0, 1, 2, 3, 1, 2>,
- Conv::template process_tile<0, 1, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 3, 2, 0>,
- Conv::template process_tile<0, 1, 2, 3, 2, 1>,
- Conv::template process_tile<0, 1, 2, 3, 2, 2>,
- Conv::template process_tile<0, 1, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 3, 3, 0>,
- Conv::template process_tile<0, 1, 2, 3, 3, 1>,
- Conv::template process_tile<0, 1, 2, 3, 3, 2>,
- Conv::template process_tile<0, 1, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 4, 0, 0>,
- Conv::template process_tile<0, 1, 2, 4, 0, 1>,
- Conv::template process_tile<0, 1, 2, 4, 0, 2>,
- Conv::template process_tile<0, 1, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 4, 1, 0>,
- Conv::template process_tile<0, 1, 2, 4, 1, 1>,
- Conv::template process_tile<0, 1, 2, 4, 1, 2>,
- Conv::template process_tile<0, 1, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 4, 2, 0>,
- Conv::template process_tile<0, 1, 2, 4, 2, 1>,
- Conv::template process_tile<0, 1, 2, 4, 2, 2>,
- Conv::template process_tile<0, 1, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 4, 3, 0>,
- Conv::template process_tile<0, 1, 2, 4, 3, 1>,
- Conv::template process_tile<0, 1, 2, 4, 3, 2>,
- Conv::template process_tile<0, 1, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 5, 0, 0>,
- Conv::template process_tile<0, 1, 2, 5, 0, 1>,
- Conv::template process_tile<0, 1, 2, 5, 0, 2>,
- Conv::template process_tile<0, 1, 2, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 5, 1, 0>,
- Conv::template process_tile<0, 1, 2, 5, 1, 1>,
- Conv::template process_tile<0, 1, 2, 5, 1, 2>,
- Conv::template process_tile<0, 1, 2, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 5, 2, 0>,
- Conv::template process_tile<0, 1, 2, 5, 2, 1>,
- Conv::template process_tile<0, 1, 2, 5, 2, 2>,
- Conv::template process_tile<0, 1, 2, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 5, 3, 0>,
- Conv::template process_tile<0, 1, 2, 5, 3, 1>,
- Conv::template process_tile<0, 1, 2, 5, 3, 2>,
- Conv::template process_tile<0, 1, 2, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 2, 6, 0, 0>,
- Conv::template process_tile<0, 1, 2, 6, 0, 1>,
- Conv::template process_tile<0, 1, 2, 6, 0, 2>,
- Conv::template process_tile<0, 1, 2, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 2, 6, 1, 0>,
- Conv::template process_tile<0, 1, 2, 6, 1, 1>,
- Conv::template process_tile<0, 1, 2, 6, 1, 2>,
- Conv::template process_tile<0, 1, 2, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 2, 6, 2, 0>,
- Conv::template process_tile<0, 1, 2, 6, 2, 1>,
- Conv::template process_tile<0, 1, 2, 6, 2, 2>,
- Conv::template process_tile<0, 1, 2, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 2, 6, 3, 0>,
- Conv::template process_tile<0, 1, 2, 6, 3, 1>,
- Conv::template process_tile<0, 1, 2, 6, 3, 2>,
- Conv::template process_tile<0, 1, 2, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 0, 0, 0>,
- Conv::template process_tile<0, 1, 3, 0, 0, 1>,
- Conv::template process_tile<0, 1, 3, 0, 0, 2>,
- Conv::template process_tile<0, 1, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 0, 1, 0>,
- Conv::template process_tile<0, 1, 3, 0, 1, 1>,
- Conv::template process_tile<0, 1, 3, 0, 1, 2>,
- Conv::template process_tile<0, 1, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 0, 2, 0>,
- Conv::template process_tile<0, 1, 3, 0, 2, 1>,
- Conv::template process_tile<0, 1, 3, 0, 2, 2>,
- Conv::template process_tile<0, 1, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 0, 3, 0>,
- Conv::template process_tile<0, 1, 3, 0, 3, 1>,
- Conv::template process_tile<0, 1, 3, 0, 3, 2>,
- Conv::template process_tile<0, 1, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 1, 0, 0>,
- Conv::template process_tile<0, 1, 3, 1, 0, 1>,
- Conv::template process_tile<0, 1, 3, 1, 0, 2>,
- Conv::template process_tile<0, 1, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 1, 1, 0>,
- Conv::template process_tile<0, 1, 3, 1, 1, 1>,
- Conv::template process_tile<0, 1, 3, 1, 1, 2>,
- Conv::template process_tile<0, 1, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 1, 2, 0>,
- Conv::template process_tile<0, 1, 3, 1, 2, 1>,
- Conv::template process_tile<0, 1, 3, 1, 2, 2>,
- Conv::template process_tile<0, 1, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 1, 3, 0>,
- Conv::template process_tile<0, 1, 3, 1, 3, 1>,
- Conv::template process_tile<0, 1, 3, 1, 3, 2>,
- Conv::template process_tile<0, 1, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 2, 0, 0>,
- Conv::template process_tile<0, 1, 3, 2, 0, 1>,
- Conv::template process_tile<0, 1, 3, 2, 0, 2>,
- Conv::template process_tile<0, 1, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 2, 1, 0>,
- Conv::template process_tile<0, 1, 3, 2, 1, 1>,
- Conv::template process_tile<0, 1, 3, 2, 1, 2>,
- Conv::template process_tile<0, 1, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 2, 2, 0>,
- Conv::template process_tile<0, 1, 3, 2, 2, 1>,
- Conv::template process_tile<0, 1, 3, 2, 2, 2>,
- Conv::template process_tile<0, 1, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 2, 3, 0>,
- Conv::template process_tile<0, 1, 3, 2, 3, 1>,
- Conv::template process_tile<0, 1, 3, 2, 3, 2>,
- Conv::template process_tile<0, 1, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 3, 0, 0>,
- Conv::template process_tile<0, 1, 3, 3, 0, 1>,
- Conv::template process_tile<0, 1, 3, 3, 0, 2>,
- Conv::template process_tile<0, 1, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 3, 1, 0>,
- Conv::template process_tile<0, 1, 3, 3, 1, 1>,
- Conv::template process_tile<0, 1, 3, 3, 1, 2>,
- Conv::template process_tile<0, 1, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 3, 2, 0>,
- Conv::template process_tile<0, 1, 3, 3, 2, 1>,
- Conv::template process_tile<0, 1, 3, 3, 2, 2>,
- Conv::template process_tile<0, 1, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 3, 3, 0>,
- Conv::template process_tile<0, 1, 3, 3, 3, 1>,
- Conv::template process_tile<0, 1, 3, 3, 3, 2>,
- Conv::template process_tile<0, 1, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 4, 0, 0>,
- Conv::template process_tile<0, 1, 3, 4, 0, 1>,
- Conv::template process_tile<0, 1, 3, 4, 0, 2>,
- Conv::template process_tile<0, 1, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 4, 1, 0>,
- Conv::template process_tile<0, 1, 3, 4, 1, 1>,
- Conv::template process_tile<0, 1, 3, 4, 1, 2>,
- Conv::template process_tile<0, 1, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 4, 2, 0>,
- Conv::template process_tile<0, 1, 3, 4, 2, 1>,
- Conv::template process_tile<0, 1, 3, 4, 2, 2>,
- Conv::template process_tile<0, 1, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 4, 3, 0>,
- Conv::template process_tile<0, 1, 3, 4, 3, 1>,
- Conv::template process_tile<0, 1, 3, 4, 3, 2>,
- Conv::template process_tile<0, 1, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 5, 0, 0>,
- Conv::template process_tile<0, 1, 3, 5, 0, 1>,
- Conv::template process_tile<0, 1, 3, 5, 0, 2>,
- Conv::template process_tile<0, 1, 3, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 5, 1, 0>,
- Conv::template process_tile<0, 1, 3, 5, 1, 1>,
- Conv::template process_tile<0, 1, 3, 5, 1, 2>,
- Conv::template process_tile<0, 1, 3, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 5, 2, 0>,
- Conv::template process_tile<0, 1, 3, 5, 2, 1>,
- Conv::template process_tile<0, 1, 3, 5, 2, 2>,
- Conv::template process_tile<0, 1, 3, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 5, 3, 0>,
- Conv::template process_tile<0, 1, 3, 5, 3, 1>,
- Conv::template process_tile<0, 1, 3, 5, 3, 2>,
- Conv::template process_tile<0, 1, 3, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 3, 6, 0, 0>,
- Conv::template process_tile<0, 1, 3, 6, 0, 1>,
- Conv::template process_tile<0, 1, 3, 6, 0, 2>,
- Conv::template process_tile<0, 1, 3, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 3, 6, 1, 0>,
- Conv::template process_tile<0, 1, 3, 6, 1, 1>,
- Conv::template process_tile<0, 1, 3, 6, 1, 2>,
- Conv::template process_tile<0, 1, 3, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 3, 6, 2, 0>,
- Conv::template process_tile<0, 1, 3, 6, 2, 1>,
- Conv::template process_tile<0, 1, 3, 6, 2, 2>,
- Conv::template process_tile<0, 1, 3, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 3, 6, 3, 0>,
- Conv::template process_tile<0, 1, 3, 6, 3, 1>,
- Conv::template process_tile<0, 1, 3, 6, 3, 2>,
- Conv::template process_tile<0, 1, 3, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 0, 0, 0>,
- Conv::template process_tile<0, 1, 4, 0, 0, 1>,
- Conv::template process_tile<0, 1, 4, 0, 0, 2>,
- Conv::template process_tile<0, 1, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 0, 1, 0>,
- Conv::template process_tile<0, 1, 4, 0, 1, 1>,
- Conv::template process_tile<0, 1, 4, 0, 1, 2>,
- Conv::template process_tile<0, 1, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 0, 2, 0>,
- Conv::template process_tile<0, 1, 4, 0, 2, 1>,
- Conv::template process_tile<0, 1, 4, 0, 2, 2>,
- Conv::template process_tile<0, 1, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 0, 3, 0>,
- Conv::template process_tile<0, 1, 4, 0, 3, 1>,
- Conv::template process_tile<0, 1, 4, 0, 3, 2>,
- Conv::template process_tile<0, 1, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 1, 0, 0>,
- Conv::template process_tile<0, 1, 4, 1, 0, 1>,
- Conv::template process_tile<0, 1, 4, 1, 0, 2>,
- Conv::template process_tile<0, 1, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 1, 1, 0>,
- Conv::template process_tile<0, 1, 4, 1, 1, 1>,
- Conv::template process_tile<0, 1, 4, 1, 1, 2>,
- Conv::template process_tile<0, 1, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 1, 2, 0>,
- Conv::template process_tile<0, 1, 4, 1, 2, 1>,
- Conv::template process_tile<0, 1, 4, 1, 2, 2>,
- Conv::template process_tile<0, 1, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 1, 3, 0>,
- Conv::template process_tile<0, 1, 4, 1, 3, 1>,
- Conv::template process_tile<0, 1, 4, 1, 3, 2>,
- Conv::template process_tile<0, 1, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 2, 0, 0>,
- Conv::template process_tile<0, 1, 4, 2, 0, 1>,
- Conv::template process_tile<0, 1, 4, 2, 0, 2>,
- Conv::template process_tile<0, 1, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 2, 1, 0>,
- Conv::template process_tile<0, 1, 4, 2, 1, 1>,
- Conv::template process_tile<0, 1, 4, 2, 1, 2>,
- Conv::template process_tile<0, 1, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 2, 2, 0>,
- Conv::template process_tile<0, 1, 4, 2, 2, 1>,
- Conv::template process_tile<0, 1, 4, 2, 2, 2>,
- Conv::template process_tile<0, 1, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 2, 3, 0>,
- Conv::template process_tile<0, 1, 4, 2, 3, 1>,
- Conv::template process_tile<0, 1, 4, 2, 3, 2>,
- Conv::template process_tile<0, 1, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 3, 0, 0>,
- Conv::template process_tile<0, 1, 4, 3, 0, 1>,
- Conv::template process_tile<0, 1, 4, 3, 0, 2>,
- Conv::template process_tile<0, 1, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 3, 1, 0>,
- Conv::template process_tile<0, 1, 4, 3, 1, 1>,
- Conv::template process_tile<0, 1, 4, 3, 1, 2>,
- Conv::template process_tile<0, 1, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 3, 2, 0>,
- Conv::template process_tile<0, 1, 4, 3, 2, 1>,
- Conv::template process_tile<0, 1, 4, 3, 2, 2>,
- Conv::template process_tile<0, 1, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 3, 3, 0>,
- Conv::template process_tile<0, 1, 4, 3, 3, 1>,
- Conv::template process_tile<0, 1, 4, 3, 3, 2>,
- Conv::template process_tile<0, 1, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 4, 0, 0>,
- Conv::template process_tile<0, 1, 4, 4, 0, 1>,
- Conv::template process_tile<0, 1, 4, 4, 0, 2>,
- Conv::template process_tile<0, 1, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 4, 1, 0>,
- Conv::template process_tile<0, 1, 4, 4, 1, 1>,
- Conv::template process_tile<0, 1, 4, 4, 1, 2>,
- Conv::template process_tile<0, 1, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 4, 2, 0>,
- Conv::template process_tile<0, 1, 4, 4, 2, 1>,
- Conv::template process_tile<0, 1, 4, 4, 2, 2>,
- Conv::template process_tile<0, 1, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 4, 3, 0>,
- Conv::template process_tile<0, 1, 4, 4, 3, 1>,
- Conv::template process_tile<0, 1, 4, 4, 3, 2>,
- Conv::template process_tile<0, 1, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 5, 0, 0>,
- Conv::template process_tile<0, 1, 4, 5, 0, 1>,
- Conv::template process_tile<0, 1, 4, 5, 0, 2>,
- Conv::template process_tile<0, 1, 4, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 5, 1, 0>,
- Conv::template process_tile<0, 1, 4, 5, 1, 1>,
- Conv::template process_tile<0, 1, 4, 5, 1, 2>,
- Conv::template process_tile<0, 1, 4, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 5, 2, 0>,
- Conv::template process_tile<0, 1, 4, 5, 2, 1>,
- Conv::template process_tile<0, 1, 4, 5, 2, 2>,
- Conv::template process_tile<0, 1, 4, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 5, 3, 0>,
- Conv::template process_tile<0, 1, 4, 5, 3, 1>,
- Conv::template process_tile<0, 1, 4, 5, 3, 2>,
- Conv::template process_tile<0, 1, 4, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 4, 6, 0, 0>,
- Conv::template process_tile<0, 1, 4, 6, 0, 1>,
- Conv::template process_tile<0, 1, 4, 6, 0, 2>,
- Conv::template process_tile<0, 1, 4, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 4, 6, 1, 0>,
- Conv::template process_tile<0, 1, 4, 6, 1, 1>,
- Conv::template process_tile<0, 1, 4, 6, 1, 2>,
- Conv::template process_tile<0, 1, 4, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 4, 6, 2, 0>,
- Conv::template process_tile<0, 1, 4, 6, 2, 1>,
- Conv::template process_tile<0, 1, 4, 6, 2, 2>,
- Conv::template process_tile<0, 1, 4, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 4, 6, 3, 0>,
- Conv::template process_tile<0, 1, 4, 6, 3, 1>,
- Conv::template process_tile<0, 1, 4, 6, 3, 2>,
- Conv::template process_tile<0, 1, 4, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 0, 0, 0>,
- Conv::template process_tile<0, 1, 5, 0, 0, 1>,
- Conv::template process_tile<0, 1, 5, 0, 0, 2>,
- Conv::template process_tile<0, 1, 5, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 0, 1, 0>,
- Conv::template process_tile<0, 1, 5, 0, 1, 1>,
- Conv::template process_tile<0, 1, 5, 0, 1, 2>,
- Conv::template process_tile<0, 1, 5, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 0, 2, 0>,
- Conv::template process_tile<0, 1, 5, 0, 2, 1>,
- Conv::template process_tile<0, 1, 5, 0, 2, 2>,
- Conv::template process_tile<0, 1, 5, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 0, 3, 0>,
- Conv::template process_tile<0, 1, 5, 0, 3, 1>,
- Conv::template process_tile<0, 1, 5, 0, 3, 2>,
- Conv::template process_tile<0, 1, 5, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 1, 0, 0>,
- Conv::template process_tile<0, 1, 5, 1, 0, 1>,
- Conv::template process_tile<0, 1, 5, 1, 0, 2>,
- Conv::template process_tile<0, 1, 5, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 1, 1, 0>,
- Conv::template process_tile<0, 1, 5, 1, 1, 1>,
- Conv::template process_tile<0, 1, 5, 1, 1, 2>,
- Conv::template process_tile<0, 1, 5, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 1, 2, 0>,
- Conv::template process_tile<0, 1, 5, 1, 2, 1>,
- Conv::template process_tile<0, 1, 5, 1, 2, 2>,
- Conv::template process_tile<0, 1, 5, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 1, 3, 0>,
- Conv::template process_tile<0, 1, 5, 1, 3, 1>,
- Conv::template process_tile<0, 1, 5, 1, 3, 2>,
- Conv::template process_tile<0, 1, 5, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 2, 0, 0>,
- Conv::template process_tile<0, 1, 5, 2, 0, 1>,
- Conv::template process_tile<0, 1, 5, 2, 0, 2>,
- Conv::template process_tile<0, 1, 5, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 2, 1, 0>,
- Conv::template process_tile<0, 1, 5, 2, 1, 1>,
- Conv::template process_tile<0, 1, 5, 2, 1, 2>,
- Conv::template process_tile<0, 1, 5, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 2, 2, 0>,
- Conv::template process_tile<0, 1, 5, 2, 2, 1>,
- Conv::template process_tile<0, 1, 5, 2, 2, 2>,
- Conv::template process_tile<0, 1, 5, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 2, 3, 0>,
- Conv::template process_tile<0, 1, 5, 2, 3, 1>,
- Conv::template process_tile<0, 1, 5, 2, 3, 2>,
- Conv::template process_tile<0, 1, 5, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 3, 0, 0>,
- Conv::template process_tile<0, 1, 5, 3, 0, 1>,
- Conv::template process_tile<0, 1, 5, 3, 0, 2>,
- Conv::template process_tile<0, 1, 5, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 3, 1, 0>,
- Conv::template process_tile<0, 1, 5, 3, 1, 1>,
- Conv::template process_tile<0, 1, 5, 3, 1, 2>,
- Conv::template process_tile<0, 1, 5, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 3, 2, 0>,
- Conv::template process_tile<0, 1, 5, 3, 2, 1>,
- Conv::template process_tile<0, 1, 5, 3, 2, 2>,
- Conv::template process_tile<0, 1, 5, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 3, 3, 0>,
- Conv::template process_tile<0, 1, 5, 3, 3, 1>,
- Conv::template process_tile<0, 1, 5, 3, 3, 2>,
- Conv::template process_tile<0, 1, 5, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 4, 0, 0>,
- Conv::template process_tile<0, 1, 5, 4, 0, 1>,
- Conv::template process_tile<0, 1, 5, 4, 0, 2>,
- Conv::template process_tile<0, 1, 5, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 4, 1, 0>,
- Conv::template process_tile<0, 1, 5, 4, 1, 1>,
- Conv::template process_tile<0, 1, 5, 4, 1, 2>,
- Conv::template process_tile<0, 1, 5, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 4, 2, 0>,
- Conv::template process_tile<0, 1, 5, 4, 2, 1>,
- Conv::template process_tile<0, 1, 5, 4, 2, 2>,
- Conv::template process_tile<0, 1, 5, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 4, 3, 0>,
- Conv::template process_tile<0, 1, 5, 4, 3, 1>,
- Conv::template process_tile<0, 1, 5, 4, 3, 2>,
- Conv::template process_tile<0, 1, 5, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 5, 0, 0>,
- Conv::template process_tile<0, 1, 5, 5, 0, 1>,
- Conv::template process_tile<0, 1, 5, 5, 0, 2>,
- Conv::template process_tile<0, 1, 5, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 5, 1, 0>,
- Conv::template process_tile<0, 1, 5, 5, 1, 1>,
- Conv::template process_tile<0, 1, 5, 5, 1, 2>,
- Conv::template process_tile<0, 1, 5, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 5, 2, 0>,
- Conv::template process_tile<0, 1, 5, 5, 2, 1>,
- Conv::template process_tile<0, 1, 5, 5, 2, 2>,
- Conv::template process_tile<0, 1, 5, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 5, 3, 0>,
- Conv::template process_tile<0, 1, 5, 5, 3, 1>,
- Conv::template process_tile<0, 1, 5, 5, 3, 2>,
- Conv::template process_tile<0, 1, 5, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 5, 6, 0, 0>,
- Conv::template process_tile<0, 1, 5, 6, 0, 1>,
- Conv::template process_tile<0, 1, 5, 6, 0, 2>,
- Conv::template process_tile<0, 1, 5, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 5, 6, 1, 0>,
- Conv::template process_tile<0, 1, 5, 6, 1, 1>,
- Conv::template process_tile<0, 1, 5, 6, 1, 2>,
- Conv::template process_tile<0, 1, 5, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 5, 6, 2, 0>,
- Conv::template process_tile<0, 1, 5, 6, 2, 1>,
- Conv::template process_tile<0, 1, 5, 6, 2, 2>,
- Conv::template process_tile<0, 1, 5, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 5, 6, 3, 0>,
- Conv::template process_tile<0, 1, 5, 6, 3, 1>,
- Conv::template process_tile<0, 1, 5, 6, 3, 2>,
- Conv::template process_tile<0, 1, 5, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 0, 0, 0>,
- Conv::template process_tile<0, 1, 6, 0, 0, 1>,
- Conv::template process_tile<0, 1, 6, 0, 0, 2>,
- Conv::template process_tile<0, 1, 6, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 0, 1, 0>,
- Conv::template process_tile<0, 1, 6, 0, 1, 1>,
- Conv::template process_tile<0, 1, 6, 0, 1, 2>,
- Conv::template process_tile<0, 1, 6, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 0, 2, 0>,
- Conv::template process_tile<0, 1, 6, 0, 2, 1>,
- Conv::template process_tile<0, 1, 6, 0, 2, 2>,
- Conv::template process_tile<0, 1, 6, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 0, 3, 0>,
- Conv::template process_tile<0, 1, 6, 0, 3, 1>,
- Conv::template process_tile<0, 1, 6, 0, 3, 2>,
- Conv::template process_tile<0, 1, 6, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 1, 0, 0>,
- Conv::template process_tile<0, 1, 6, 1, 0, 1>,
- Conv::template process_tile<0, 1, 6, 1, 0, 2>,
- Conv::template process_tile<0, 1, 6, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 1, 1, 0>,
- Conv::template process_tile<0, 1, 6, 1, 1, 1>,
- Conv::template process_tile<0, 1, 6, 1, 1, 2>,
- Conv::template process_tile<0, 1, 6, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 1, 2, 0>,
- Conv::template process_tile<0, 1, 6, 1, 2, 1>,
- Conv::template process_tile<0, 1, 6, 1, 2, 2>,
- Conv::template process_tile<0, 1, 6, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 1, 3, 0>,
- Conv::template process_tile<0, 1, 6, 1, 3, 1>,
- Conv::template process_tile<0, 1, 6, 1, 3, 2>,
- Conv::template process_tile<0, 1, 6, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 2, 0, 0>,
- Conv::template process_tile<0, 1, 6, 2, 0, 1>,
- Conv::template process_tile<0, 1, 6, 2, 0, 2>,
- Conv::template process_tile<0, 1, 6, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 2, 1, 0>,
- Conv::template process_tile<0, 1, 6, 2, 1, 1>,
- Conv::template process_tile<0, 1, 6, 2, 1, 2>,
- Conv::template process_tile<0, 1, 6, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 2, 2, 0>,
- Conv::template process_tile<0, 1, 6, 2, 2, 1>,
- Conv::template process_tile<0, 1, 6, 2, 2, 2>,
- Conv::template process_tile<0, 1, 6, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 2, 3, 0>,
- Conv::template process_tile<0, 1, 6, 2, 3, 1>,
- Conv::template process_tile<0, 1, 6, 2, 3, 2>,
- Conv::template process_tile<0, 1, 6, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 3, 0, 0>,
- Conv::template process_tile<0, 1, 6, 3, 0, 1>,
- Conv::template process_tile<0, 1, 6, 3, 0, 2>,
- Conv::template process_tile<0, 1, 6, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 3, 1, 0>,
- Conv::template process_tile<0, 1, 6, 3, 1, 1>,
- Conv::template process_tile<0, 1, 6, 3, 1, 2>,
- Conv::template process_tile<0, 1, 6, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 3, 2, 0>,
- Conv::template process_tile<0, 1, 6, 3, 2, 1>,
- Conv::template process_tile<0, 1, 6, 3, 2, 2>,
- Conv::template process_tile<0, 1, 6, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 3, 3, 0>,
- Conv::template process_tile<0, 1, 6, 3, 3, 1>,
- Conv::template process_tile<0, 1, 6, 3, 3, 2>,
- Conv::template process_tile<0, 1, 6, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 4, 0, 0>,
- Conv::template process_tile<0, 1, 6, 4, 0, 1>,
- Conv::template process_tile<0, 1, 6, 4, 0, 2>,
- Conv::template process_tile<0, 1, 6, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 4, 1, 0>,
- Conv::template process_tile<0, 1, 6, 4, 1, 1>,
- Conv::template process_tile<0, 1, 6, 4, 1, 2>,
- Conv::template process_tile<0, 1, 6, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 4, 2, 0>,
- Conv::template process_tile<0, 1, 6, 4, 2, 1>,
- Conv::template process_tile<0, 1, 6, 4, 2, 2>,
- Conv::template process_tile<0, 1, 6, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 4, 3, 0>,
- Conv::template process_tile<0, 1, 6, 4, 3, 1>,
- Conv::template process_tile<0, 1, 6, 4, 3, 2>,
- Conv::template process_tile<0, 1, 6, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 5, 0, 0>,
- Conv::template process_tile<0, 1, 6, 5, 0, 1>,
- Conv::template process_tile<0, 1, 6, 5, 0, 2>,
- Conv::template process_tile<0, 1, 6, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 5, 1, 0>,
- Conv::template process_tile<0, 1, 6, 5, 1, 1>,
- Conv::template process_tile<0, 1, 6, 5, 1, 2>,
- Conv::template process_tile<0, 1, 6, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 5, 2, 0>,
- Conv::template process_tile<0, 1, 6, 5, 2, 1>,
- Conv::template process_tile<0, 1, 6, 5, 2, 2>,
- Conv::template process_tile<0, 1, 6, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 5, 3, 0>,
- Conv::template process_tile<0, 1, 6, 5, 3, 1>,
- Conv::template process_tile<0, 1, 6, 5, 3, 2>,
- Conv::template process_tile<0, 1, 6, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<0, 1, 6, 6, 0, 0>,
- Conv::template process_tile<0, 1, 6, 6, 0, 1>,
- Conv::template process_tile<0, 1, 6, 6, 0, 2>,
- Conv::template process_tile<0, 1, 6, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<0, 1, 6, 6, 1, 0>,
- Conv::template process_tile<0, 1, 6, 6, 1, 1>,
- Conv::template process_tile<0, 1, 6, 6, 1, 2>,
- Conv::template process_tile<0, 1, 6, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<0, 1, 6, 6, 2, 0>,
- Conv::template process_tile<0, 1, 6, 6, 2, 1>,
- Conv::template process_tile<0, 1, 6, 6, 2, 2>,
- Conv::template process_tile<0, 1, 6, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<0, 1, 6, 6, 3, 0>,
- Conv::template process_tile<0, 1, 6, 6, 3, 1>,
- Conv::template process_tile<0, 1, 6, 6, 3, 2>,
- Conv::template process_tile<0, 1, 6, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 1
- }, // Input pad top = 0
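- // The six nesting levels of this table mirror the template parameters of
- // process_tile<in_pad_top, in_pad_left, in_pad_bottom, in_pad_right,
- //              out_pad_bottom, out_pad_right>:
- // each leaf is the specialisation compiled for one combination of input and
- // output padding, so runtime padding values select a kernel with a single
- // multi-dimensional table lookup. (Descriptive note inferred from the table
- // structure; the innermost index is presumably the output right padding,
- // matching the last template argument.)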
- { // Input pad top = 1
- { // Input pad left = 0
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 0, 0, 0>,
- Conv::template process_tile<1, 0, 0, 0, 0, 1>,
- Conv::template process_tile<1, 0, 0, 0, 0, 2>,
- Conv::template process_tile<1, 0, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 0, 1, 0>,
- Conv::template process_tile<1, 0, 0, 0, 1, 1>,
- Conv::template process_tile<1, 0, 0, 0, 1, 2>,
- Conv::template process_tile<1, 0, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 0, 2, 0>,
- Conv::template process_tile<1, 0, 0, 0, 2, 1>,
- Conv::template process_tile<1, 0, 0, 0, 2, 2>,
- Conv::template process_tile<1, 0, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 0, 3, 0>,
- Conv::template process_tile<1, 0, 0, 0, 3, 1>,
- Conv::template process_tile<1, 0, 0, 0, 3, 2>,
- Conv::template process_tile<1, 0, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 1, 0, 0>,
- Conv::template process_tile<1, 0, 0, 1, 0, 1>,
- Conv::template process_tile<1, 0, 0, 1, 0, 2>,
- Conv::template process_tile<1, 0, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 1, 1, 0>,
- Conv::template process_tile<1, 0, 0, 1, 1, 1>,
- Conv::template process_tile<1, 0, 0, 1, 1, 2>,
- Conv::template process_tile<1, 0, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 1, 2, 0>,
- Conv::template process_tile<1, 0, 0, 1, 2, 1>,
- Conv::template process_tile<1, 0, 0, 1, 2, 2>,
- Conv::template process_tile<1, 0, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 1, 3, 0>,
- Conv::template process_tile<1, 0, 0, 1, 3, 1>,
- Conv::template process_tile<1, 0, 0, 1, 3, 2>,
- Conv::template process_tile<1, 0, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 2, 0, 0>,
- Conv::template process_tile<1, 0, 0, 2, 0, 1>,
- Conv::template process_tile<1, 0, 0, 2, 0, 2>,
- Conv::template process_tile<1, 0, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 2, 1, 0>,
- Conv::template process_tile<1, 0, 0, 2, 1, 1>,
- Conv::template process_tile<1, 0, 0, 2, 1, 2>,
- Conv::template process_tile<1, 0, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 2, 2, 0>,
- Conv::template process_tile<1, 0, 0, 2, 2, 1>,
- Conv::template process_tile<1, 0, 0, 2, 2, 2>,
- Conv::template process_tile<1, 0, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 2, 3, 0>,
- Conv::template process_tile<1, 0, 0, 2, 3, 1>,
- Conv::template process_tile<1, 0, 0, 2, 3, 2>,
- Conv::template process_tile<1, 0, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 3, 0, 0>,
- Conv::template process_tile<1, 0, 0, 3, 0, 1>,
- Conv::template process_tile<1, 0, 0, 3, 0, 2>,
- Conv::template process_tile<1, 0, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 3, 1, 0>,
- Conv::template process_tile<1, 0, 0, 3, 1, 1>,
- Conv::template process_tile<1, 0, 0, 3, 1, 2>,
- Conv::template process_tile<1, 0, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 3, 2, 0>,
- Conv::template process_tile<1, 0, 0, 3, 2, 1>,
- Conv::template process_tile<1, 0, 0, 3, 2, 2>,
- Conv::template process_tile<1, 0, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 3, 3, 0>,
- Conv::template process_tile<1, 0, 0, 3, 3, 1>,
- Conv::template process_tile<1, 0, 0, 3, 3, 2>,
- Conv::template process_tile<1, 0, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 4, 0, 0>,
- Conv::template process_tile<1, 0, 0, 4, 0, 1>,
- Conv::template process_tile<1, 0, 0, 4, 0, 2>,
- Conv::template process_tile<1, 0, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 4, 1, 0>,
- Conv::template process_tile<1, 0, 0, 4, 1, 1>,
- Conv::template process_tile<1, 0, 0, 4, 1, 2>,
- Conv::template process_tile<1, 0, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 4, 2, 0>,
- Conv::template process_tile<1, 0, 0, 4, 2, 1>,
- Conv::template process_tile<1, 0, 0, 4, 2, 2>,
- Conv::template process_tile<1, 0, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 4, 3, 0>,
- Conv::template process_tile<1, 0, 0, 4, 3, 1>,
- Conv::template process_tile<1, 0, 0, 4, 3, 2>,
- Conv::template process_tile<1, 0, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 5, 0, 0>,
- Conv::template process_tile<1, 0, 0, 5, 0, 1>,
- Conv::template process_tile<1, 0, 0, 5, 0, 2>,
- Conv::template process_tile<1, 0, 0, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 5, 1, 0>,
- Conv::template process_tile<1, 0, 0, 5, 1, 1>,
- Conv::template process_tile<1, 0, 0, 5, 1, 2>,
- Conv::template process_tile<1, 0, 0, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 5, 2, 0>,
- Conv::template process_tile<1, 0, 0, 5, 2, 1>,
- Conv::template process_tile<1, 0, 0, 5, 2, 2>,
- Conv::template process_tile<1, 0, 0, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 5, 3, 0>,
- Conv::template process_tile<1, 0, 0, 5, 3, 1>,
- Conv::template process_tile<1, 0, 0, 5, 3, 2>,
- Conv::template process_tile<1, 0, 0, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 0, 6, 0, 0>,
- Conv::template process_tile<1, 0, 0, 6, 0, 1>,
- Conv::template process_tile<1, 0, 0, 6, 0, 2>,
- Conv::template process_tile<1, 0, 0, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 0, 6, 1, 0>,
- Conv::template process_tile<1, 0, 0, 6, 1, 1>,
- Conv::template process_tile<1, 0, 0, 6, 1, 2>,
- Conv::template process_tile<1, 0, 0, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 0, 6, 2, 0>,
- Conv::template process_tile<1, 0, 0, 6, 2, 1>,
- Conv::template process_tile<1, 0, 0, 6, 2, 2>,
- Conv::template process_tile<1, 0, 0, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 0, 6, 3, 0>,
- Conv::template process_tile<1, 0, 0, 6, 3, 1>,
- Conv::template process_tile<1, 0, 0, 6, 3, 2>,
- Conv::template process_tile<1, 0, 0, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 0, 0, 0>,
- Conv::template process_tile<1, 0, 1, 0, 0, 1>,
- Conv::template process_tile<1, 0, 1, 0, 0, 2>,
- Conv::template process_tile<1, 0, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 0, 1, 0>,
- Conv::template process_tile<1, 0, 1, 0, 1, 1>,
- Conv::template process_tile<1, 0, 1, 0, 1, 2>,
- Conv::template process_tile<1, 0, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 0, 2, 0>,
- Conv::template process_tile<1, 0, 1, 0, 2, 1>,
- Conv::template process_tile<1, 0, 1, 0, 2, 2>,
- Conv::template process_tile<1, 0, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 0, 3, 0>,
- Conv::template process_tile<1, 0, 1, 0, 3, 1>,
- Conv::template process_tile<1, 0, 1, 0, 3, 2>,
- Conv::template process_tile<1, 0, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 1, 0, 0>,
- Conv::template process_tile<1, 0, 1, 1, 0, 1>,
- Conv::template process_tile<1, 0, 1, 1, 0, 2>,
- Conv::template process_tile<1, 0, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 1, 1, 0>,
- Conv::template process_tile<1, 0, 1, 1, 1, 1>,
- Conv::template process_tile<1, 0, 1, 1, 1, 2>,
- Conv::template process_tile<1, 0, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 1, 2, 0>,
- Conv::template process_tile<1, 0, 1, 1, 2, 1>,
- Conv::template process_tile<1, 0, 1, 1, 2, 2>,
- Conv::template process_tile<1, 0, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 1, 3, 0>,
- Conv::template process_tile<1, 0, 1, 1, 3, 1>,
- Conv::template process_tile<1, 0, 1, 1, 3, 2>,
- Conv::template process_tile<1, 0, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 2, 0, 0>,
- Conv::template process_tile<1, 0, 1, 2, 0, 1>,
- Conv::template process_tile<1, 0, 1, 2, 0, 2>,
- Conv::template process_tile<1, 0, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 2, 1, 0>,
- Conv::template process_tile<1, 0, 1, 2, 1, 1>,
- Conv::template process_tile<1, 0, 1, 2, 1, 2>,
- Conv::template process_tile<1, 0, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 2, 2, 0>,
- Conv::template process_tile<1, 0, 1, 2, 2, 1>,
- Conv::template process_tile<1, 0, 1, 2, 2, 2>,
- Conv::template process_tile<1, 0, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 2, 3, 0>,
- Conv::template process_tile<1, 0, 1, 2, 3, 1>,
- Conv::template process_tile<1, 0, 1, 2, 3, 2>,
- Conv::template process_tile<1, 0, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 3, 0, 0>,
- Conv::template process_tile<1, 0, 1, 3, 0, 1>,
- Conv::template process_tile<1, 0, 1, 3, 0, 2>,
- Conv::template process_tile<1, 0, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 3, 1, 0>,
- Conv::template process_tile<1, 0, 1, 3, 1, 1>,
- Conv::template process_tile<1, 0, 1, 3, 1, 2>,
- Conv::template process_tile<1, 0, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 3, 2, 0>,
- Conv::template process_tile<1, 0, 1, 3, 2, 1>,
- Conv::template process_tile<1, 0, 1, 3, 2, 2>,
- Conv::template process_tile<1, 0, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 3, 3, 0>,
- Conv::template process_tile<1, 0, 1, 3, 3, 1>,
- Conv::template process_tile<1, 0, 1, 3, 3, 2>,
- Conv::template process_tile<1, 0, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 4, 0, 0>,
- Conv::template process_tile<1, 0, 1, 4, 0, 1>,
- Conv::template process_tile<1, 0, 1, 4, 0, 2>,
- Conv::template process_tile<1, 0, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 4, 1, 0>,
- Conv::template process_tile<1, 0, 1, 4, 1, 1>,
- Conv::template process_tile<1, 0, 1, 4, 1, 2>,
- Conv::template process_tile<1, 0, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 4, 2, 0>,
- Conv::template process_tile<1, 0, 1, 4, 2, 1>,
- Conv::template process_tile<1, 0, 1, 4, 2, 2>,
- Conv::template process_tile<1, 0, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 4, 3, 0>,
- Conv::template process_tile<1, 0, 1, 4, 3, 1>,
- Conv::template process_tile<1, 0, 1, 4, 3, 2>,
- Conv::template process_tile<1, 0, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 5, 0, 0>,
- Conv::template process_tile<1, 0, 1, 5, 0, 1>,
- Conv::template process_tile<1, 0, 1, 5, 0, 2>,
- Conv::template process_tile<1, 0, 1, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 5, 1, 0>,
- Conv::template process_tile<1, 0, 1, 5, 1, 1>,
- Conv::template process_tile<1, 0, 1, 5, 1, 2>,
- Conv::template process_tile<1, 0, 1, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 5, 2, 0>,
- Conv::template process_tile<1, 0, 1, 5, 2, 1>,
- Conv::template process_tile<1, 0, 1, 5, 2, 2>,
- Conv::template process_tile<1, 0, 1, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 5, 3, 0>,
- Conv::template process_tile<1, 0, 1, 5, 3, 1>,
- Conv::template process_tile<1, 0, 1, 5, 3, 2>,
- Conv::template process_tile<1, 0, 1, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 1, 6, 0, 0>,
- Conv::template process_tile<1, 0, 1, 6, 0, 1>,
- Conv::template process_tile<1, 0, 1, 6, 0, 2>,
- Conv::template process_tile<1, 0, 1, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 1, 6, 1, 0>,
- Conv::template process_tile<1, 0, 1, 6, 1, 1>,
- Conv::template process_tile<1, 0, 1, 6, 1, 2>,
- Conv::template process_tile<1, 0, 1, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 1, 6, 2, 0>,
- Conv::template process_tile<1, 0, 1, 6, 2, 1>,
- Conv::template process_tile<1, 0, 1, 6, 2, 2>,
- Conv::template process_tile<1, 0, 1, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 1, 6, 3, 0>,
- Conv::template process_tile<1, 0, 1, 6, 3, 1>,
- Conv::template process_tile<1, 0, 1, 6, 3, 2>,
- Conv::template process_tile<1, 0, 1, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 0, 0, 0>,
- Conv::template process_tile<1, 0, 2, 0, 0, 1>,
- Conv::template process_tile<1, 0, 2, 0, 0, 2>,
- Conv::template process_tile<1, 0, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 0, 1, 0>,
- Conv::template process_tile<1, 0, 2, 0, 1, 1>,
- Conv::template process_tile<1, 0, 2, 0, 1, 2>,
- Conv::template process_tile<1, 0, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 0, 2, 0>,
- Conv::template process_tile<1, 0, 2, 0, 2, 1>,
- Conv::template process_tile<1, 0, 2, 0, 2, 2>,
- Conv::template process_tile<1, 0, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 0, 3, 0>,
- Conv::template process_tile<1, 0, 2, 0, 3, 1>,
- Conv::template process_tile<1, 0, 2, 0, 3, 2>,
- Conv::template process_tile<1, 0, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 1, 0, 0>,
- Conv::template process_tile<1, 0, 2, 1, 0, 1>,
- Conv::template process_tile<1, 0, 2, 1, 0, 2>,
- Conv::template process_tile<1, 0, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 1, 1, 0>,
- Conv::template process_tile<1, 0, 2, 1, 1, 1>,
- Conv::template process_tile<1, 0, 2, 1, 1, 2>,
- Conv::template process_tile<1, 0, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 1, 2, 0>,
- Conv::template process_tile<1, 0, 2, 1, 2, 1>,
- Conv::template process_tile<1, 0, 2, 1, 2, 2>,
- Conv::template process_tile<1, 0, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 1, 3, 0>,
- Conv::template process_tile<1, 0, 2, 1, 3, 1>,
- Conv::template process_tile<1, 0, 2, 1, 3, 2>,
- Conv::template process_tile<1, 0, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 2, 0, 0>,
- Conv::template process_tile<1, 0, 2, 2, 0, 1>,
- Conv::template process_tile<1, 0, 2, 2, 0, 2>,
- Conv::template process_tile<1, 0, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 2, 1, 0>,
- Conv::template process_tile<1, 0, 2, 2, 1, 1>,
- Conv::template process_tile<1, 0, 2, 2, 1, 2>,
- Conv::template process_tile<1, 0, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 2, 2, 0>,
- Conv::template process_tile<1, 0, 2, 2, 2, 1>,
- Conv::template process_tile<1, 0, 2, 2, 2, 2>,
- Conv::template process_tile<1, 0, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 2, 3, 0>,
- Conv::template process_tile<1, 0, 2, 2, 3, 1>,
- Conv::template process_tile<1, 0, 2, 2, 3, 2>,
- Conv::template process_tile<1, 0, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 3, 0, 0>,
- Conv::template process_tile<1, 0, 2, 3, 0, 1>,
- Conv::template process_tile<1, 0, 2, 3, 0, 2>,
- Conv::template process_tile<1, 0, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 3, 1, 0>,
- Conv::template process_tile<1, 0, 2, 3, 1, 1>,
- Conv::template process_tile<1, 0, 2, 3, 1, 2>,
- Conv::template process_tile<1, 0, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 3, 2, 0>,
- Conv::template process_tile<1, 0, 2, 3, 2, 1>,
- Conv::template process_tile<1, 0, 2, 3, 2, 2>,
- Conv::template process_tile<1, 0, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 3, 3, 0>,
- Conv::template process_tile<1, 0, 2, 3, 3, 1>,
- Conv::template process_tile<1, 0, 2, 3, 3, 2>,
- Conv::template process_tile<1, 0, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 4, 0, 0>,
- Conv::template process_tile<1, 0, 2, 4, 0, 1>,
- Conv::template process_tile<1, 0, 2, 4, 0, 2>,
- Conv::template process_tile<1, 0, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 4, 1, 0>,
- Conv::template process_tile<1, 0, 2, 4, 1, 1>,
- Conv::template process_tile<1, 0, 2, 4, 1, 2>,
- Conv::template process_tile<1, 0, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 4, 2, 0>,
- Conv::template process_tile<1, 0, 2, 4, 2, 1>,
- Conv::template process_tile<1, 0, 2, 4, 2, 2>,
- Conv::template process_tile<1, 0, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 4, 3, 0>,
- Conv::template process_tile<1, 0, 2, 4, 3, 1>,
- Conv::template process_tile<1, 0, 2, 4, 3, 2>,
- Conv::template process_tile<1, 0, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 5, 0, 0>,
- Conv::template process_tile<1, 0, 2, 5, 0, 1>,
- Conv::template process_tile<1, 0, 2, 5, 0, 2>,
- Conv::template process_tile<1, 0, 2, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 5, 1, 0>,
- Conv::template process_tile<1, 0, 2, 5, 1, 1>,
- Conv::template process_tile<1, 0, 2, 5, 1, 2>,
- Conv::template process_tile<1, 0, 2, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 5, 2, 0>,
- Conv::template process_tile<1, 0, 2, 5, 2, 1>,
- Conv::template process_tile<1, 0, 2, 5, 2, 2>,
- Conv::template process_tile<1, 0, 2, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 5, 3, 0>,
- Conv::template process_tile<1, 0, 2, 5, 3, 1>,
- Conv::template process_tile<1, 0, 2, 5, 3, 2>,
- Conv::template process_tile<1, 0, 2, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 2, 6, 0, 0>,
- Conv::template process_tile<1, 0, 2, 6, 0, 1>,
- Conv::template process_tile<1, 0, 2, 6, 0, 2>,
- Conv::template process_tile<1, 0, 2, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 2, 6, 1, 0>,
- Conv::template process_tile<1, 0, 2, 6, 1, 1>,
- Conv::template process_tile<1, 0, 2, 6, 1, 2>,
- Conv::template process_tile<1, 0, 2, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 2, 6, 2, 0>,
- Conv::template process_tile<1, 0, 2, 6, 2, 1>,
- Conv::template process_tile<1, 0, 2, 6, 2, 2>,
- Conv::template process_tile<1, 0, 2, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 2, 6, 3, 0>,
- Conv::template process_tile<1, 0, 2, 6, 3, 1>,
- Conv::template process_tile<1, 0, 2, 6, 3, 2>,
- Conv::template process_tile<1, 0, 2, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 0, 0, 0>,
- Conv::template process_tile<1, 0, 3, 0, 0, 1>,
- Conv::template process_tile<1, 0, 3, 0, 0, 2>,
- Conv::template process_tile<1, 0, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 0, 1, 0>,
- Conv::template process_tile<1, 0, 3, 0, 1, 1>,
- Conv::template process_tile<1, 0, 3, 0, 1, 2>,
- Conv::template process_tile<1, 0, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 0, 2, 0>,
- Conv::template process_tile<1, 0, 3, 0, 2, 1>,
- Conv::template process_tile<1, 0, 3, 0, 2, 2>,
- Conv::template process_tile<1, 0, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 0, 3, 0>,
- Conv::template process_tile<1, 0, 3, 0, 3, 1>,
- Conv::template process_tile<1, 0, 3, 0, 3, 2>,
- Conv::template process_tile<1, 0, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 1, 0, 0>,
- Conv::template process_tile<1, 0, 3, 1, 0, 1>,
- Conv::template process_tile<1, 0, 3, 1, 0, 2>,
- Conv::template process_tile<1, 0, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 1, 1, 0>,
- Conv::template process_tile<1, 0, 3, 1, 1, 1>,
- Conv::template process_tile<1, 0, 3, 1, 1, 2>,
- Conv::template process_tile<1, 0, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 1, 2, 0>,
- Conv::template process_tile<1, 0, 3, 1, 2, 1>,
- Conv::template process_tile<1, 0, 3, 1, 2, 2>,
- Conv::template process_tile<1, 0, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 1, 3, 0>,
- Conv::template process_tile<1, 0, 3, 1, 3, 1>,
- Conv::template process_tile<1, 0, 3, 1, 3, 2>,
- Conv::template process_tile<1, 0, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 2, 0, 0>,
- Conv::template process_tile<1, 0, 3, 2, 0, 1>,
- Conv::template process_tile<1, 0, 3, 2, 0, 2>,
- Conv::template process_tile<1, 0, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 2, 1, 0>,
- Conv::template process_tile<1, 0, 3, 2, 1, 1>,
- Conv::template process_tile<1, 0, 3, 2, 1, 2>,
- Conv::template process_tile<1, 0, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 2, 2, 0>,
- Conv::template process_tile<1, 0, 3, 2, 2, 1>,
- Conv::template process_tile<1, 0, 3, 2, 2, 2>,
- Conv::template process_tile<1, 0, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 2, 3, 0>,
- Conv::template process_tile<1, 0, 3, 2, 3, 1>,
- Conv::template process_tile<1, 0, 3, 2, 3, 2>,
- Conv::template process_tile<1, 0, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 3, 0, 0>,
- Conv::template process_tile<1, 0, 3, 3, 0, 1>,
- Conv::template process_tile<1, 0, 3, 3, 0, 2>,
- Conv::template process_tile<1, 0, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 3, 1, 0>,
- Conv::template process_tile<1, 0, 3, 3, 1, 1>,
- Conv::template process_tile<1, 0, 3, 3, 1, 2>,
- Conv::template process_tile<1, 0, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 3, 2, 0>,
- Conv::template process_tile<1, 0, 3, 3, 2, 1>,
- Conv::template process_tile<1, 0, 3, 3, 2, 2>,
- Conv::template process_tile<1, 0, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 3, 3, 0>,
- Conv::template process_tile<1, 0, 3, 3, 3, 1>,
- Conv::template process_tile<1, 0, 3, 3, 3, 2>,
- Conv::template process_tile<1, 0, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 4, 0, 0>,
- Conv::template process_tile<1, 0, 3, 4, 0, 1>,
- Conv::template process_tile<1, 0, 3, 4, 0, 2>,
- Conv::template process_tile<1, 0, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 4, 1, 0>,
- Conv::template process_tile<1, 0, 3, 4, 1, 1>,
- Conv::template process_tile<1, 0, 3, 4, 1, 2>,
- Conv::template process_tile<1, 0, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 4, 2, 0>,
- Conv::template process_tile<1, 0, 3, 4, 2, 1>,
- Conv::template process_tile<1, 0, 3, 4, 2, 2>,
- Conv::template process_tile<1, 0, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 4, 3, 0>,
- Conv::template process_tile<1, 0, 3, 4, 3, 1>,
- Conv::template process_tile<1, 0, 3, 4, 3, 2>,
- Conv::template process_tile<1, 0, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 5, 0, 0>,
- Conv::template process_tile<1, 0, 3, 5, 0, 1>,
- Conv::template process_tile<1, 0, 3, 5, 0, 2>,
- Conv::template process_tile<1, 0, 3, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 5, 1, 0>,
- Conv::template process_tile<1, 0, 3, 5, 1, 1>,
- Conv::template process_tile<1, 0, 3, 5, 1, 2>,
- Conv::template process_tile<1, 0, 3, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 5, 2, 0>,
- Conv::template process_tile<1, 0, 3, 5, 2, 1>,
- Conv::template process_tile<1, 0, 3, 5, 2, 2>,
- Conv::template process_tile<1, 0, 3, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 5, 3, 0>,
- Conv::template process_tile<1, 0, 3, 5, 3, 1>,
- Conv::template process_tile<1, 0, 3, 5, 3, 2>,
- Conv::template process_tile<1, 0, 3, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 3, 6, 0, 0>,
- Conv::template process_tile<1, 0, 3, 6, 0, 1>,
- Conv::template process_tile<1, 0, 3, 6, 0, 2>,
- Conv::template process_tile<1, 0, 3, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 3, 6, 1, 0>,
- Conv::template process_tile<1, 0, 3, 6, 1, 1>,
- Conv::template process_tile<1, 0, 3, 6, 1, 2>,
- Conv::template process_tile<1, 0, 3, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 3, 6, 2, 0>,
- Conv::template process_tile<1, 0, 3, 6, 2, 1>,
- Conv::template process_tile<1, 0, 3, 6, 2, 2>,
- Conv::template process_tile<1, 0, 3, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 3, 6, 3, 0>,
- Conv::template process_tile<1, 0, 3, 6, 3, 1>,
- Conv::template process_tile<1, 0, 3, 6, 3, 2>,
- Conv::template process_tile<1, 0, 3, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 0, 0, 0>,
- Conv::template process_tile<1, 0, 4, 0, 0, 1>,
- Conv::template process_tile<1, 0, 4, 0, 0, 2>,
- Conv::template process_tile<1, 0, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 0, 1, 0>,
- Conv::template process_tile<1, 0, 4, 0, 1, 1>,
- Conv::template process_tile<1, 0, 4, 0, 1, 2>,
- Conv::template process_tile<1, 0, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 0, 2, 0>,
- Conv::template process_tile<1, 0, 4, 0, 2, 1>,
- Conv::template process_tile<1, 0, 4, 0, 2, 2>,
- Conv::template process_tile<1, 0, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 0, 3, 0>,
- Conv::template process_tile<1, 0, 4, 0, 3, 1>,
- Conv::template process_tile<1, 0, 4, 0, 3, 2>,
- Conv::template process_tile<1, 0, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 1, 0, 0>,
- Conv::template process_tile<1, 0, 4, 1, 0, 1>,
- Conv::template process_tile<1, 0, 4, 1, 0, 2>,
- Conv::template process_tile<1, 0, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 1, 1, 0>,
- Conv::template process_tile<1, 0, 4, 1, 1, 1>,
- Conv::template process_tile<1, 0, 4, 1, 1, 2>,
- Conv::template process_tile<1, 0, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 1, 2, 0>,
- Conv::template process_tile<1, 0, 4, 1, 2, 1>,
- Conv::template process_tile<1, 0, 4, 1, 2, 2>,
- Conv::template process_tile<1, 0, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 1, 3, 0>,
- Conv::template process_tile<1, 0, 4, 1, 3, 1>,
- Conv::template process_tile<1, 0, 4, 1, 3, 2>,
- Conv::template process_tile<1, 0, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 2, 0, 0>,
- Conv::template process_tile<1, 0, 4, 2, 0, 1>,
- Conv::template process_tile<1, 0, 4, 2, 0, 2>,
- Conv::template process_tile<1, 0, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 2, 1, 0>,
- Conv::template process_tile<1, 0, 4, 2, 1, 1>,
- Conv::template process_tile<1, 0, 4, 2, 1, 2>,
- Conv::template process_tile<1, 0, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 2, 2, 0>,
- Conv::template process_tile<1, 0, 4, 2, 2, 1>,
- Conv::template process_tile<1, 0, 4, 2, 2, 2>,
- Conv::template process_tile<1, 0, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 2, 3, 0>,
- Conv::template process_tile<1, 0, 4, 2, 3, 1>,
- Conv::template process_tile<1, 0, 4, 2, 3, 2>,
- Conv::template process_tile<1, 0, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 3, 0, 0>,
- Conv::template process_tile<1, 0, 4, 3, 0, 1>,
- Conv::template process_tile<1, 0, 4, 3, 0, 2>,
- Conv::template process_tile<1, 0, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 3, 1, 0>,
- Conv::template process_tile<1, 0, 4, 3, 1, 1>,
- Conv::template process_tile<1, 0, 4, 3, 1, 2>,
- Conv::template process_tile<1, 0, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 3, 2, 0>,
- Conv::template process_tile<1, 0, 4, 3, 2, 1>,
- Conv::template process_tile<1, 0, 4, 3, 2, 2>,
- Conv::template process_tile<1, 0, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 3, 3, 0>,
- Conv::template process_tile<1, 0, 4, 3, 3, 1>,
- Conv::template process_tile<1, 0, 4, 3, 3, 2>,
- Conv::template process_tile<1, 0, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 4, 0, 0>,
- Conv::template process_tile<1, 0, 4, 4, 0, 1>,
- Conv::template process_tile<1, 0, 4, 4, 0, 2>,
- Conv::template process_tile<1, 0, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 4, 1, 0>,
- Conv::template process_tile<1, 0, 4, 4, 1, 1>,
- Conv::template process_tile<1, 0, 4, 4, 1, 2>,
- Conv::template process_tile<1, 0, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 4, 2, 0>,
- Conv::template process_tile<1, 0, 4, 4, 2, 1>,
- Conv::template process_tile<1, 0, 4, 4, 2, 2>,
- Conv::template process_tile<1, 0, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 4, 3, 0>,
- Conv::template process_tile<1, 0, 4, 4, 3, 1>,
- Conv::template process_tile<1, 0, 4, 4, 3, 2>,
- Conv::template process_tile<1, 0, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 5, 0, 0>,
- Conv::template process_tile<1, 0, 4, 5, 0, 1>,
- Conv::template process_tile<1, 0, 4, 5, 0, 2>,
- Conv::template process_tile<1, 0, 4, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 5, 1, 0>,
- Conv::template process_tile<1, 0, 4, 5, 1, 1>,
- Conv::template process_tile<1, 0, 4, 5, 1, 2>,
- Conv::template process_tile<1, 0, 4, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 5, 2, 0>,
- Conv::template process_tile<1, 0, 4, 5, 2, 1>,
- Conv::template process_tile<1, 0, 4, 5, 2, 2>,
- Conv::template process_tile<1, 0, 4, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 5, 3, 0>,
- Conv::template process_tile<1, 0, 4, 5, 3, 1>,
- Conv::template process_tile<1, 0, 4, 5, 3, 2>,
- Conv::template process_tile<1, 0, 4, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 4, 6, 0, 0>,
- Conv::template process_tile<1, 0, 4, 6, 0, 1>,
- Conv::template process_tile<1, 0, 4, 6, 0, 2>,
- Conv::template process_tile<1, 0, 4, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 4, 6, 1, 0>,
- Conv::template process_tile<1, 0, 4, 6, 1, 1>,
- Conv::template process_tile<1, 0, 4, 6, 1, 2>,
- Conv::template process_tile<1, 0, 4, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 4, 6, 2, 0>,
- Conv::template process_tile<1, 0, 4, 6, 2, 1>,
- Conv::template process_tile<1, 0, 4, 6, 2, 2>,
- Conv::template process_tile<1, 0, 4, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 4, 6, 3, 0>,
- Conv::template process_tile<1, 0, 4, 6, 3, 1>,
- Conv::template process_tile<1, 0, 4, 6, 3, 2>,
- Conv::template process_tile<1, 0, 4, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 0, 0, 0>,
- Conv::template process_tile<1, 0, 5, 0, 0, 1>,
- Conv::template process_tile<1, 0, 5, 0, 0, 2>,
- Conv::template process_tile<1, 0, 5, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 0, 1, 0>,
- Conv::template process_tile<1, 0, 5, 0, 1, 1>,
- Conv::template process_tile<1, 0, 5, 0, 1, 2>,
- Conv::template process_tile<1, 0, 5, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 0, 2, 0>,
- Conv::template process_tile<1, 0, 5, 0, 2, 1>,
- Conv::template process_tile<1, 0, 5, 0, 2, 2>,
- Conv::template process_tile<1, 0, 5, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 0, 3, 0>,
- Conv::template process_tile<1, 0, 5, 0, 3, 1>,
- Conv::template process_tile<1, 0, 5, 0, 3, 2>,
- Conv::template process_tile<1, 0, 5, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 1, 0, 0>,
- Conv::template process_tile<1, 0, 5, 1, 0, 1>,
- Conv::template process_tile<1, 0, 5, 1, 0, 2>,
- Conv::template process_tile<1, 0, 5, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 1, 1, 0>,
- Conv::template process_tile<1, 0, 5, 1, 1, 1>,
- Conv::template process_tile<1, 0, 5, 1, 1, 2>,
- Conv::template process_tile<1, 0, 5, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 1, 2, 0>,
- Conv::template process_tile<1, 0, 5, 1, 2, 1>,
- Conv::template process_tile<1, 0, 5, 1, 2, 2>,
- Conv::template process_tile<1, 0, 5, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 1, 3, 0>,
- Conv::template process_tile<1, 0, 5, 1, 3, 1>,
- Conv::template process_tile<1, 0, 5, 1, 3, 2>,
- Conv::template process_tile<1, 0, 5, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 2, 0, 0>,
- Conv::template process_tile<1, 0, 5, 2, 0, 1>,
- Conv::template process_tile<1, 0, 5, 2, 0, 2>,
- Conv::template process_tile<1, 0, 5, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 2, 1, 0>,
- Conv::template process_tile<1, 0, 5, 2, 1, 1>,
- Conv::template process_tile<1, 0, 5, 2, 1, 2>,
- Conv::template process_tile<1, 0, 5, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 2, 2, 0>,
- Conv::template process_tile<1, 0, 5, 2, 2, 1>,
- Conv::template process_tile<1, 0, 5, 2, 2, 2>,
- Conv::template process_tile<1, 0, 5, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 2, 3, 0>,
- Conv::template process_tile<1, 0, 5, 2, 3, 1>,
- Conv::template process_tile<1, 0, 5, 2, 3, 2>,
- Conv::template process_tile<1, 0, 5, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 3, 0, 0>,
- Conv::template process_tile<1, 0, 5, 3, 0, 1>,
- Conv::template process_tile<1, 0, 5, 3, 0, 2>,
- Conv::template process_tile<1, 0, 5, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 3, 1, 0>,
- Conv::template process_tile<1, 0, 5, 3, 1, 1>,
- Conv::template process_tile<1, 0, 5, 3, 1, 2>,
- Conv::template process_tile<1, 0, 5, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 3, 2, 0>,
- Conv::template process_tile<1, 0, 5, 3, 2, 1>,
- Conv::template process_tile<1, 0, 5, 3, 2, 2>,
- Conv::template process_tile<1, 0, 5, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 3, 3, 0>,
- Conv::template process_tile<1, 0, 5, 3, 3, 1>,
- Conv::template process_tile<1, 0, 5, 3, 3, 2>,
- Conv::template process_tile<1, 0, 5, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 4, 0, 0>,
- Conv::template process_tile<1, 0, 5, 4, 0, 1>,
- Conv::template process_tile<1, 0, 5, 4, 0, 2>,
- Conv::template process_tile<1, 0, 5, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 4, 1, 0>,
- Conv::template process_tile<1, 0, 5, 4, 1, 1>,
- Conv::template process_tile<1, 0, 5, 4, 1, 2>,
- Conv::template process_tile<1, 0, 5, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 4, 2, 0>,
- Conv::template process_tile<1, 0, 5, 4, 2, 1>,
- Conv::template process_tile<1, 0, 5, 4, 2, 2>,
- Conv::template process_tile<1, 0, 5, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 4, 3, 0>,
- Conv::template process_tile<1, 0, 5, 4, 3, 1>,
- Conv::template process_tile<1, 0, 5, 4, 3, 2>,
- Conv::template process_tile<1, 0, 5, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 5, 0, 0>,
- Conv::template process_tile<1, 0, 5, 5, 0, 1>,
- Conv::template process_tile<1, 0, 5, 5, 0, 2>,
- Conv::template process_tile<1, 0, 5, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 5, 1, 0>,
- Conv::template process_tile<1, 0, 5, 5, 1, 1>,
- Conv::template process_tile<1, 0, 5, 5, 1, 2>,
- Conv::template process_tile<1, 0, 5, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 5, 2, 0>,
- Conv::template process_tile<1, 0, 5, 5, 2, 1>,
- Conv::template process_tile<1, 0, 5, 5, 2, 2>,
- Conv::template process_tile<1, 0, 5, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 5, 3, 0>,
- Conv::template process_tile<1, 0, 5, 5, 3, 1>,
- Conv::template process_tile<1, 0, 5, 5, 3, 2>,
- Conv::template process_tile<1, 0, 5, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 5, 6, 0, 0>,
- Conv::template process_tile<1, 0, 5, 6, 0, 1>,
- Conv::template process_tile<1, 0, 5, 6, 0, 2>,
- Conv::template process_tile<1, 0, 5, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 5, 6, 1, 0>,
- Conv::template process_tile<1, 0, 5, 6, 1, 1>,
- Conv::template process_tile<1, 0, 5, 6, 1, 2>,
- Conv::template process_tile<1, 0, 5, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 5, 6, 2, 0>,
- Conv::template process_tile<1, 0, 5, 6, 2, 1>,
- Conv::template process_tile<1, 0, 5, 6, 2, 2>,
- Conv::template process_tile<1, 0, 5, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 5, 6, 3, 0>,
- Conv::template process_tile<1, 0, 5, 6, 3, 1>,
- Conv::template process_tile<1, 0, 5, 6, 3, 2>,
- Conv::template process_tile<1, 0, 5, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 0, 0, 0>,
- Conv::template process_tile<1, 0, 6, 0, 0, 1>,
- Conv::template process_tile<1, 0, 6, 0, 0, 2>,
- Conv::template process_tile<1, 0, 6, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 0, 1, 0>,
- Conv::template process_tile<1, 0, 6, 0, 1, 1>,
- Conv::template process_tile<1, 0, 6, 0, 1, 2>,
- Conv::template process_tile<1, 0, 6, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 0, 2, 0>,
- Conv::template process_tile<1, 0, 6, 0, 2, 1>,
- Conv::template process_tile<1, 0, 6, 0, 2, 2>,
- Conv::template process_tile<1, 0, 6, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 0, 3, 0>,
- Conv::template process_tile<1, 0, 6, 0, 3, 1>,
- Conv::template process_tile<1, 0, 6, 0, 3, 2>,
- Conv::template process_tile<1, 0, 6, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 1, 0, 0>,
- Conv::template process_tile<1, 0, 6, 1, 0, 1>,
- Conv::template process_tile<1, 0, 6, 1, 0, 2>,
- Conv::template process_tile<1, 0, 6, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 1, 1, 0>,
- Conv::template process_tile<1, 0, 6, 1, 1, 1>,
- Conv::template process_tile<1, 0, 6, 1, 1, 2>,
- Conv::template process_tile<1, 0, 6, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 1, 2, 0>,
- Conv::template process_tile<1, 0, 6, 1, 2, 1>,
- Conv::template process_tile<1, 0, 6, 1, 2, 2>,
- Conv::template process_tile<1, 0, 6, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 1, 3, 0>,
- Conv::template process_tile<1, 0, 6, 1, 3, 1>,
- Conv::template process_tile<1, 0, 6, 1, 3, 2>,
- Conv::template process_tile<1, 0, 6, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 2, 0, 0>,
- Conv::template process_tile<1, 0, 6, 2, 0, 1>,
- Conv::template process_tile<1, 0, 6, 2, 0, 2>,
- Conv::template process_tile<1, 0, 6, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 2, 1, 0>,
- Conv::template process_tile<1, 0, 6, 2, 1, 1>,
- Conv::template process_tile<1, 0, 6, 2, 1, 2>,
- Conv::template process_tile<1, 0, 6, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 2, 2, 0>,
- Conv::template process_tile<1, 0, 6, 2, 2, 1>,
- Conv::template process_tile<1, 0, 6, 2, 2, 2>,
- Conv::template process_tile<1, 0, 6, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 2, 3, 0>,
- Conv::template process_tile<1, 0, 6, 2, 3, 1>,
- Conv::template process_tile<1, 0, 6, 2, 3, 2>,
- Conv::template process_tile<1, 0, 6, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 3, 0, 0>,
- Conv::template process_tile<1, 0, 6, 3, 0, 1>,
- Conv::template process_tile<1, 0, 6, 3, 0, 2>,
- Conv::template process_tile<1, 0, 6, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 3, 1, 0>,
- Conv::template process_tile<1, 0, 6, 3, 1, 1>,
- Conv::template process_tile<1, 0, 6, 3, 1, 2>,
- Conv::template process_tile<1, 0, 6, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 3, 2, 0>,
- Conv::template process_tile<1, 0, 6, 3, 2, 1>,
- Conv::template process_tile<1, 0, 6, 3, 2, 2>,
- Conv::template process_tile<1, 0, 6, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 3, 3, 0>,
- Conv::template process_tile<1, 0, 6, 3, 3, 1>,
- Conv::template process_tile<1, 0, 6, 3, 3, 2>,
- Conv::template process_tile<1, 0, 6, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 4, 0, 0>,
- Conv::template process_tile<1, 0, 6, 4, 0, 1>,
- Conv::template process_tile<1, 0, 6, 4, 0, 2>,
- Conv::template process_tile<1, 0, 6, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 4, 1, 0>,
- Conv::template process_tile<1, 0, 6, 4, 1, 1>,
- Conv::template process_tile<1, 0, 6, 4, 1, 2>,
- Conv::template process_tile<1, 0, 6, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 4, 2, 0>,
- Conv::template process_tile<1, 0, 6, 4, 2, 1>,
- Conv::template process_tile<1, 0, 6, 4, 2, 2>,
- Conv::template process_tile<1, 0, 6, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 4, 3, 0>,
- Conv::template process_tile<1, 0, 6, 4, 3, 1>,
- Conv::template process_tile<1, 0, 6, 4, 3, 2>,
- Conv::template process_tile<1, 0, 6, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 5, 0, 0>,
- Conv::template process_tile<1, 0, 6, 5, 0, 1>,
- Conv::template process_tile<1, 0, 6, 5, 0, 2>,
- Conv::template process_tile<1, 0, 6, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 5, 1, 0>,
- Conv::template process_tile<1, 0, 6, 5, 1, 1>,
- Conv::template process_tile<1, 0, 6, 5, 1, 2>,
- Conv::template process_tile<1, 0, 6, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 5, 2, 0>,
- Conv::template process_tile<1, 0, 6, 5, 2, 1>,
- Conv::template process_tile<1, 0, 6, 5, 2, 2>,
- Conv::template process_tile<1, 0, 6, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 5, 3, 0>,
- Conv::template process_tile<1, 0, 6, 5, 3, 1>,
- Conv::template process_tile<1, 0, 6, 5, 3, 2>,
- Conv::template process_tile<1, 0, 6, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 0, 6, 6, 0, 0>,
- Conv::template process_tile<1, 0, 6, 6, 0, 1>,
- Conv::template process_tile<1, 0, 6, 6, 0, 2>,
- Conv::template process_tile<1, 0, 6, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 0, 6, 6, 1, 0>,
- Conv::template process_tile<1, 0, 6, 6, 1, 1>,
- Conv::template process_tile<1, 0, 6, 6, 1, 2>,
- Conv::template process_tile<1, 0, 6, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 0, 6, 6, 2, 0>,
- Conv::template process_tile<1, 0, 6, 6, 2, 1>,
- Conv::template process_tile<1, 0, 6, 6, 2, 2>,
- Conv::template process_tile<1, 0, 6, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 0, 6, 6, 3, 0>,
- Conv::template process_tile<1, 0, 6, 6, 3, 1>,
- Conv::template process_tile<1, 0, 6, 6, 3, 2>,
- Conv::template process_tile<1, 0, 6, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 0
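The deleted slice that closes here is one arm of a statically generated dispatch table: the nesting order visible in the comments is input pad left, input pad bottom, input pad right, output pad bottom, with the innermost four entries covering output pad right = 0..3, and the leading template argument (fixed at 1 throughout this slice) presumably being the input pad top index from an enclosing level of the table. Every leaf entry is the address of a distinct process_tile specialisation, so the runtime padding values reduce to a single indexed call instead of a cascade of branches. A minimal, self-contained sketch of that pattern, with hypothetical names and the extents cut from 7 x 7 x 4 x 4 per slice down to 2 x 2:

// Sketch only (hypothetical names): a statically initialised table of
// function pointers, indexed by runtime padding values, where every entry
// is a distinct process_tile specialisation. Pad top = 1 and pad left = 0
// are fixed to match the slice that closes above; the output-pad dimensions
// are dropped for brevity.
#include <cstdio>

struct Conv
{
    template <int PadTop, int PadLeft, int PadBottom, int PadRight,
              int OutPadBottom, int OutPadRight>
    static void process_tile()
    {
        std::printf("process_tile<%d, %d, %d, %d, %d, %d>\n",
                    PadTop, PadLeft, PadBottom, PadRight,
                    OutPadBottom, OutPadRight);
    }
};

using TileFn = void (*)();

// Rows: input pad bottom = 0..1; columns: input pad right = 0..1.
static const TileFn tile_fns[2][2] = {
    { Conv::process_tile<1, 0, 0, 0, 0, 0>, Conv::process_tile<1, 0, 0, 1, 0, 0> },
    { Conv::process_tile<1, 0, 1, 0, 0, 0>, Conv::process_tile<1, 0, 1, 1, 0, 0> },
};

int main()
{
    const int pad_bottom = 1; // runtime padding values
    const int pad_right  = 0;
    tile_fns[pad_bottom][pad_right](); // one indexed call selects the specialisation
    return 0;
}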
- { // Input pad left = 1
- { // Input pad bottom = 0
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 0, 0, 0>,
- Conv::template process_tile<1, 1, 0, 0, 0, 1>,
- Conv::template process_tile<1, 1, 0, 0, 0, 2>,
- Conv::template process_tile<1, 1, 0, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 0, 1, 0>,
- Conv::template process_tile<1, 1, 0, 0, 1, 1>,
- Conv::template process_tile<1, 1, 0, 0, 1, 2>,
- Conv::template process_tile<1, 1, 0, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 0, 2, 0>,
- Conv::template process_tile<1, 1, 0, 0, 2, 1>,
- Conv::template process_tile<1, 1, 0, 0, 2, 2>,
- Conv::template process_tile<1, 1, 0, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 0, 3, 0>,
- Conv::template process_tile<1, 1, 0, 0, 3, 1>,
- Conv::template process_tile<1, 1, 0, 0, 3, 2>,
- Conv::template process_tile<1, 1, 0, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 1, 0, 0>,
- Conv::template process_tile<1, 1, 0, 1, 0, 1>,
- Conv::template process_tile<1, 1, 0, 1, 0, 2>,
- Conv::template process_tile<1, 1, 0, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 1, 1, 0>,
- Conv::template process_tile<1, 1, 0, 1, 1, 1>,
- Conv::template process_tile<1, 1, 0, 1, 1, 2>,
- Conv::template process_tile<1, 1, 0, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 1, 2, 0>,
- Conv::template process_tile<1, 1, 0, 1, 2, 1>,
- Conv::template process_tile<1, 1, 0, 1, 2, 2>,
- Conv::template process_tile<1, 1, 0, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 1, 3, 0>,
- Conv::template process_tile<1, 1, 0, 1, 3, 1>,
- Conv::template process_tile<1, 1, 0, 1, 3, 2>,
- Conv::template process_tile<1, 1, 0, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 2, 0, 0>,
- Conv::template process_tile<1, 1, 0, 2, 0, 1>,
- Conv::template process_tile<1, 1, 0, 2, 0, 2>,
- Conv::template process_tile<1, 1, 0, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 2, 1, 0>,
- Conv::template process_tile<1, 1, 0, 2, 1, 1>,
- Conv::template process_tile<1, 1, 0, 2, 1, 2>,
- Conv::template process_tile<1, 1, 0, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 2, 2, 0>,
- Conv::template process_tile<1, 1, 0, 2, 2, 1>,
- Conv::template process_tile<1, 1, 0, 2, 2, 2>,
- Conv::template process_tile<1, 1, 0, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 2, 3, 0>,
- Conv::template process_tile<1, 1, 0, 2, 3, 1>,
- Conv::template process_tile<1, 1, 0, 2, 3, 2>,
- Conv::template process_tile<1, 1, 0, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 3, 0, 0>,
- Conv::template process_tile<1, 1, 0, 3, 0, 1>,
- Conv::template process_tile<1, 1, 0, 3, 0, 2>,
- Conv::template process_tile<1, 1, 0, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 3, 1, 0>,
- Conv::template process_tile<1, 1, 0, 3, 1, 1>,
- Conv::template process_tile<1, 1, 0, 3, 1, 2>,
- Conv::template process_tile<1, 1, 0, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 3, 2, 0>,
- Conv::template process_tile<1, 1, 0, 3, 2, 1>,
- Conv::template process_tile<1, 1, 0, 3, 2, 2>,
- Conv::template process_tile<1, 1, 0, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 3, 3, 0>,
- Conv::template process_tile<1, 1, 0, 3, 3, 1>,
- Conv::template process_tile<1, 1, 0, 3, 3, 2>,
- Conv::template process_tile<1, 1, 0, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 4, 0, 0>,
- Conv::template process_tile<1, 1, 0, 4, 0, 1>,
- Conv::template process_tile<1, 1, 0, 4, 0, 2>,
- Conv::template process_tile<1, 1, 0, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 4, 1, 0>,
- Conv::template process_tile<1, 1, 0, 4, 1, 1>,
- Conv::template process_tile<1, 1, 0, 4, 1, 2>,
- Conv::template process_tile<1, 1, 0, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 4, 2, 0>,
- Conv::template process_tile<1, 1, 0, 4, 2, 1>,
- Conv::template process_tile<1, 1, 0, 4, 2, 2>,
- Conv::template process_tile<1, 1, 0, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 4, 3, 0>,
- Conv::template process_tile<1, 1, 0, 4, 3, 1>,
- Conv::template process_tile<1, 1, 0, 4, 3, 2>,
- Conv::template process_tile<1, 1, 0, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 5, 0, 0>,
- Conv::template process_tile<1, 1, 0, 5, 0, 1>,
- Conv::template process_tile<1, 1, 0, 5, 0, 2>,
- Conv::template process_tile<1, 1, 0, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 5, 1, 0>,
- Conv::template process_tile<1, 1, 0, 5, 1, 1>,
- Conv::template process_tile<1, 1, 0, 5, 1, 2>,
- Conv::template process_tile<1, 1, 0, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 5, 2, 0>,
- Conv::template process_tile<1, 1, 0, 5, 2, 1>,
- Conv::template process_tile<1, 1, 0, 5, 2, 2>,
- Conv::template process_tile<1, 1, 0, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 5, 3, 0>,
- Conv::template process_tile<1, 1, 0, 5, 3, 1>,
- Conv::template process_tile<1, 1, 0, 5, 3, 2>,
- Conv::template process_tile<1, 1, 0, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 0, 6, 0, 0>,
- Conv::template process_tile<1, 1, 0, 6, 0, 1>,
- Conv::template process_tile<1, 1, 0, 6, 0, 2>,
- Conv::template process_tile<1, 1, 0, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 0, 6, 1, 0>,
- Conv::template process_tile<1, 1, 0, 6, 1, 1>,
- Conv::template process_tile<1, 1, 0, 6, 1, 2>,
- Conv::template process_tile<1, 1, 0, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 0, 6, 2, 0>,
- Conv::template process_tile<1, 1, 0, 6, 2, 1>,
- Conv::template process_tile<1, 1, 0, 6, 2, 2>,
- Conv::template process_tile<1, 1, 0, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 0, 6, 3, 0>,
- Conv::template process_tile<1, 1, 0, 6, 3, 1>,
- Conv::template process_tile<1, 1, 0, 6, 3, 2>,
- Conv::template process_tile<1, 1, 0, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 0
- { // Input pad bottom = 1
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 0, 0, 0>,
- Conv::template process_tile<1, 1, 1, 0, 0, 1>,
- Conv::template process_tile<1, 1, 1, 0, 0, 2>,
- Conv::template process_tile<1, 1, 1, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 0, 1, 0>,
- Conv::template process_tile<1, 1, 1, 0, 1, 1>,
- Conv::template process_tile<1, 1, 1, 0, 1, 2>,
- Conv::template process_tile<1, 1, 1, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 0, 2, 0>,
- Conv::template process_tile<1, 1, 1, 0, 2, 1>,
- Conv::template process_tile<1, 1, 1, 0, 2, 2>,
- Conv::template process_tile<1, 1, 1, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 0, 3, 0>,
- Conv::template process_tile<1, 1, 1, 0, 3, 1>,
- Conv::template process_tile<1, 1, 1, 0, 3, 2>,
- Conv::template process_tile<1, 1, 1, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 1, 0, 0>,
- Conv::template process_tile<1, 1, 1, 1, 0, 1>,
- Conv::template process_tile<1, 1, 1, 1, 0, 2>,
- Conv::template process_tile<1, 1, 1, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 1, 1, 0>,
- Conv::template process_tile<1, 1, 1, 1, 1, 1>,
- Conv::template process_tile<1, 1, 1, 1, 1, 2>,
- Conv::template process_tile<1, 1, 1, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 1, 2, 0>,
- Conv::template process_tile<1, 1, 1, 1, 2, 1>,
- Conv::template process_tile<1, 1, 1, 1, 2, 2>,
- Conv::template process_tile<1, 1, 1, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 1, 3, 0>,
- Conv::template process_tile<1, 1, 1, 1, 3, 1>,
- Conv::template process_tile<1, 1, 1, 1, 3, 2>,
- Conv::template process_tile<1, 1, 1, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 2, 0, 0>,
- Conv::template process_tile<1, 1, 1, 2, 0, 1>,
- Conv::template process_tile<1, 1, 1, 2, 0, 2>,
- Conv::template process_tile<1, 1, 1, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 2, 1, 0>,
- Conv::template process_tile<1, 1, 1, 2, 1, 1>,
- Conv::template process_tile<1, 1, 1, 2, 1, 2>,
- Conv::template process_tile<1, 1, 1, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 2, 2, 0>,
- Conv::template process_tile<1, 1, 1, 2, 2, 1>,
- Conv::template process_tile<1, 1, 1, 2, 2, 2>,
- Conv::template process_tile<1, 1, 1, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 2, 3, 0>,
- Conv::template process_tile<1, 1, 1, 2, 3, 1>,
- Conv::template process_tile<1, 1, 1, 2, 3, 2>,
- Conv::template process_tile<1, 1, 1, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 3, 0, 0>,
- Conv::template process_tile<1, 1, 1, 3, 0, 1>,
- Conv::template process_tile<1, 1, 1, 3, 0, 2>,
- Conv::template process_tile<1, 1, 1, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 3, 1, 0>,
- Conv::template process_tile<1, 1, 1, 3, 1, 1>,
- Conv::template process_tile<1, 1, 1, 3, 1, 2>,
- Conv::template process_tile<1, 1, 1, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 3, 2, 0>,
- Conv::template process_tile<1, 1, 1, 3, 2, 1>,
- Conv::template process_tile<1, 1, 1, 3, 2, 2>,
- Conv::template process_tile<1, 1, 1, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 3, 3, 0>,
- Conv::template process_tile<1, 1, 1, 3, 3, 1>,
- Conv::template process_tile<1, 1, 1, 3, 3, 2>,
- Conv::template process_tile<1, 1, 1, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 4, 0, 0>,
- Conv::template process_tile<1, 1, 1, 4, 0, 1>,
- Conv::template process_tile<1, 1, 1, 4, 0, 2>,
- Conv::template process_tile<1, 1, 1, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 4, 1, 0>,
- Conv::template process_tile<1, 1, 1, 4, 1, 1>,
- Conv::template process_tile<1, 1, 1, 4, 1, 2>,
- Conv::template process_tile<1, 1, 1, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 4, 2, 0>,
- Conv::template process_tile<1, 1, 1, 4, 2, 1>,
- Conv::template process_tile<1, 1, 1, 4, 2, 2>,
- Conv::template process_tile<1, 1, 1, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 4, 3, 0>,
- Conv::template process_tile<1, 1, 1, 4, 3, 1>,
- Conv::template process_tile<1, 1, 1, 4, 3, 2>,
- Conv::template process_tile<1, 1, 1, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 5, 0, 0>,
- Conv::template process_tile<1, 1, 1, 5, 0, 1>,
- Conv::template process_tile<1, 1, 1, 5, 0, 2>,
- Conv::template process_tile<1, 1, 1, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 5, 1, 0>,
- Conv::template process_tile<1, 1, 1, 5, 1, 1>,
- Conv::template process_tile<1, 1, 1, 5, 1, 2>,
- Conv::template process_tile<1, 1, 1, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 5, 2, 0>,
- Conv::template process_tile<1, 1, 1, 5, 2, 1>,
- Conv::template process_tile<1, 1, 1, 5, 2, 2>,
- Conv::template process_tile<1, 1, 1, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 5, 3, 0>,
- Conv::template process_tile<1, 1, 1, 5, 3, 1>,
- Conv::template process_tile<1, 1, 1, 5, 3, 2>,
- Conv::template process_tile<1, 1, 1, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 1, 6, 0, 0>,
- Conv::template process_tile<1, 1, 1, 6, 0, 1>,
- Conv::template process_tile<1, 1, 1, 6, 0, 2>,
- Conv::template process_tile<1, 1, 1, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 1, 6, 1, 0>,
- Conv::template process_tile<1, 1, 1, 6, 1, 1>,
- Conv::template process_tile<1, 1, 1, 6, 1, 2>,
- Conv::template process_tile<1, 1, 1, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 1, 6, 2, 0>,
- Conv::template process_tile<1, 1, 1, 6, 2, 1>,
- Conv::template process_tile<1, 1, 1, 6, 2, 2>,
- Conv::template process_tile<1, 1, 1, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 1, 6, 3, 0>,
- Conv::template process_tile<1, 1, 1, 6, 3, 1>,
- Conv::template process_tile<1, 1, 1, 6, 3, 2>,
- Conv::template process_tile<1, 1, 1, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 1
- { // Input pad bottom = 2
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 0, 0, 0>,
- Conv::template process_tile<1, 1, 2, 0, 0, 1>,
- Conv::template process_tile<1, 1, 2, 0, 0, 2>,
- Conv::template process_tile<1, 1, 2, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 0, 1, 0>,
- Conv::template process_tile<1, 1, 2, 0, 1, 1>,
- Conv::template process_tile<1, 1, 2, 0, 1, 2>,
- Conv::template process_tile<1, 1, 2, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 0, 2, 0>,
- Conv::template process_tile<1, 1, 2, 0, 2, 1>,
- Conv::template process_tile<1, 1, 2, 0, 2, 2>,
- Conv::template process_tile<1, 1, 2, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 0, 3, 0>,
- Conv::template process_tile<1, 1, 2, 0, 3, 1>,
- Conv::template process_tile<1, 1, 2, 0, 3, 2>,
- Conv::template process_tile<1, 1, 2, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 1, 0, 0>,
- Conv::template process_tile<1, 1, 2, 1, 0, 1>,
- Conv::template process_tile<1, 1, 2, 1, 0, 2>,
- Conv::template process_tile<1, 1, 2, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 1, 1, 0>,
- Conv::template process_tile<1, 1, 2, 1, 1, 1>,
- Conv::template process_tile<1, 1, 2, 1, 1, 2>,
- Conv::template process_tile<1, 1, 2, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 1, 2, 0>,
- Conv::template process_tile<1, 1, 2, 1, 2, 1>,
- Conv::template process_tile<1, 1, 2, 1, 2, 2>,
- Conv::template process_tile<1, 1, 2, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 1, 3, 0>,
- Conv::template process_tile<1, 1, 2, 1, 3, 1>,
- Conv::template process_tile<1, 1, 2, 1, 3, 2>,
- Conv::template process_tile<1, 1, 2, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 2, 0, 0>,
- Conv::template process_tile<1, 1, 2, 2, 0, 1>,
- Conv::template process_tile<1, 1, 2, 2, 0, 2>,
- Conv::template process_tile<1, 1, 2, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 2, 1, 0>,
- Conv::template process_tile<1, 1, 2, 2, 1, 1>,
- Conv::template process_tile<1, 1, 2, 2, 1, 2>,
- Conv::template process_tile<1, 1, 2, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 2, 2, 0>,
- Conv::template process_tile<1, 1, 2, 2, 2, 1>,
- Conv::template process_tile<1, 1, 2, 2, 2, 2>,
- Conv::template process_tile<1, 1, 2, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 2, 3, 0>,
- Conv::template process_tile<1, 1, 2, 2, 3, 1>,
- Conv::template process_tile<1, 1, 2, 2, 3, 2>,
- Conv::template process_tile<1, 1, 2, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 3, 0, 0>,
- Conv::template process_tile<1, 1, 2, 3, 0, 1>,
- Conv::template process_tile<1, 1, 2, 3, 0, 2>,
- Conv::template process_tile<1, 1, 2, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 3, 1, 0>,
- Conv::template process_tile<1, 1, 2, 3, 1, 1>,
- Conv::template process_tile<1, 1, 2, 3, 1, 2>,
- Conv::template process_tile<1, 1, 2, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 3, 2, 0>,
- Conv::template process_tile<1, 1, 2, 3, 2, 1>,
- Conv::template process_tile<1, 1, 2, 3, 2, 2>,
- Conv::template process_tile<1, 1, 2, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 3, 3, 0>,
- Conv::template process_tile<1, 1, 2, 3, 3, 1>,
- Conv::template process_tile<1, 1, 2, 3, 3, 2>,
- Conv::template process_tile<1, 1, 2, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 4, 0, 0>,
- Conv::template process_tile<1, 1, 2, 4, 0, 1>,
- Conv::template process_tile<1, 1, 2, 4, 0, 2>,
- Conv::template process_tile<1, 1, 2, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 4, 1, 0>,
- Conv::template process_tile<1, 1, 2, 4, 1, 1>,
- Conv::template process_tile<1, 1, 2, 4, 1, 2>,
- Conv::template process_tile<1, 1, 2, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 4, 2, 0>,
- Conv::template process_tile<1, 1, 2, 4, 2, 1>,
- Conv::template process_tile<1, 1, 2, 4, 2, 2>,
- Conv::template process_tile<1, 1, 2, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 4, 3, 0>,
- Conv::template process_tile<1, 1, 2, 4, 3, 1>,
- Conv::template process_tile<1, 1, 2, 4, 3, 2>,
- Conv::template process_tile<1, 1, 2, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 5, 0, 0>,
- Conv::template process_tile<1, 1, 2, 5, 0, 1>,
- Conv::template process_tile<1, 1, 2, 5, 0, 2>,
- Conv::template process_tile<1, 1, 2, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 5, 1, 0>,
- Conv::template process_tile<1, 1, 2, 5, 1, 1>,
- Conv::template process_tile<1, 1, 2, 5, 1, 2>,
- Conv::template process_tile<1, 1, 2, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 5, 2, 0>,
- Conv::template process_tile<1, 1, 2, 5, 2, 1>,
- Conv::template process_tile<1, 1, 2, 5, 2, 2>,
- Conv::template process_tile<1, 1, 2, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 5, 3, 0>,
- Conv::template process_tile<1, 1, 2, 5, 3, 1>,
- Conv::template process_tile<1, 1, 2, 5, 3, 2>,
- Conv::template process_tile<1, 1, 2, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 2, 6, 0, 0>,
- Conv::template process_tile<1, 1, 2, 6, 0, 1>,
- Conv::template process_tile<1, 1, 2, 6, 0, 2>,
- Conv::template process_tile<1, 1, 2, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 2, 6, 1, 0>,
- Conv::template process_tile<1, 1, 2, 6, 1, 1>,
- Conv::template process_tile<1, 1, 2, 6, 1, 2>,
- Conv::template process_tile<1, 1, 2, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 2, 6, 2, 0>,
- Conv::template process_tile<1, 1, 2, 6, 2, 1>,
- Conv::template process_tile<1, 1, 2, 6, 2, 2>,
- Conv::template process_tile<1, 1, 2, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 2, 6, 3, 0>,
- Conv::template process_tile<1, 1, 2, 6, 3, 1>,
- Conv::template process_tile<1, 1, 2, 6, 3, 2>,
- Conv::template process_tile<1, 1, 2, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 2
- { // Input pad bottom = 3
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 0, 0, 0>,
- Conv::template process_tile<1, 1, 3, 0, 0, 1>,
- Conv::template process_tile<1, 1, 3, 0, 0, 2>,
- Conv::template process_tile<1, 1, 3, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 0, 1, 0>,
- Conv::template process_tile<1, 1, 3, 0, 1, 1>,
- Conv::template process_tile<1, 1, 3, 0, 1, 2>,
- Conv::template process_tile<1, 1, 3, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 0, 2, 0>,
- Conv::template process_tile<1, 1, 3, 0, 2, 1>,
- Conv::template process_tile<1, 1, 3, 0, 2, 2>,
- Conv::template process_tile<1, 1, 3, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 0, 3, 0>,
- Conv::template process_tile<1, 1, 3, 0, 3, 1>,
- Conv::template process_tile<1, 1, 3, 0, 3, 2>,
- Conv::template process_tile<1, 1, 3, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 1, 0, 0>,
- Conv::template process_tile<1, 1, 3, 1, 0, 1>,
- Conv::template process_tile<1, 1, 3, 1, 0, 2>,
- Conv::template process_tile<1, 1, 3, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 1, 1, 0>,
- Conv::template process_tile<1, 1, 3, 1, 1, 1>,
- Conv::template process_tile<1, 1, 3, 1, 1, 2>,
- Conv::template process_tile<1, 1, 3, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 1, 2, 0>,
- Conv::template process_tile<1, 1, 3, 1, 2, 1>,
- Conv::template process_tile<1, 1, 3, 1, 2, 2>,
- Conv::template process_tile<1, 1, 3, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 1, 3, 0>,
- Conv::template process_tile<1, 1, 3, 1, 3, 1>,
- Conv::template process_tile<1, 1, 3, 1, 3, 2>,
- Conv::template process_tile<1, 1, 3, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 2, 0, 0>,
- Conv::template process_tile<1, 1, 3, 2, 0, 1>,
- Conv::template process_tile<1, 1, 3, 2, 0, 2>,
- Conv::template process_tile<1, 1, 3, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 2, 1, 0>,
- Conv::template process_tile<1, 1, 3, 2, 1, 1>,
- Conv::template process_tile<1, 1, 3, 2, 1, 2>,
- Conv::template process_tile<1, 1, 3, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 2, 2, 0>,
- Conv::template process_tile<1, 1, 3, 2, 2, 1>,
- Conv::template process_tile<1, 1, 3, 2, 2, 2>,
- Conv::template process_tile<1, 1, 3, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 2, 3, 0>,
- Conv::template process_tile<1, 1, 3, 2, 3, 1>,
- Conv::template process_tile<1, 1, 3, 2, 3, 2>,
- Conv::template process_tile<1, 1, 3, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 3, 0, 0>,
- Conv::template process_tile<1, 1, 3, 3, 0, 1>,
- Conv::template process_tile<1, 1, 3, 3, 0, 2>,
- Conv::template process_tile<1, 1, 3, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 3, 1, 0>,
- Conv::template process_tile<1, 1, 3, 3, 1, 1>,
- Conv::template process_tile<1, 1, 3, 3, 1, 2>,
- Conv::template process_tile<1, 1, 3, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 3, 2, 0>,
- Conv::template process_tile<1, 1, 3, 3, 2, 1>,
- Conv::template process_tile<1, 1, 3, 3, 2, 2>,
- Conv::template process_tile<1, 1, 3, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 3, 3, 0>,
- Conv::template process_tile<1, 1, 3, 3, 3, 1>,
- Conv::template process_tile<1, 1, 3, 3, 3, 2>,
- Conv::template process_tile<1, 1, 3, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 4, 0, 0>,
- Conv::template process_tile<1, 1, 3, 4, 0, 1>,
- Conv::template process_tile<1, 1, 3, 4, 0, 2>,
- Conv::template process_tile<1, 1, 3, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 4, 1, 0>,
- Conv::template process_tile<1, 1, 3, 4, 1, 1>,
- Conv::template process_tile<1, 1, 3, 4, 1, 2>,
- Conv::template process_tile<1, 1, 3, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 4, 2, 0>,
- Conv::template process_tile<1, 1, 3, 4, 2, 1>,
- Conv::template process_tile<1, 1, 3, 4, 2, 2>,
- Conv::template process_tile<1, 1, 3, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 4, 3, 0>,
- Conv::template process_tile<1, 1, 3, 4, 3, 1>,
- Conv::template process_tile<1, 1, 3, 4, 3, 2>,
- Conv::template process_tile<1, 1, 3, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 5, 0, 0>,
- Conv::template process_tile<1, 1, 3, 5, 0, 1>,
- Conv::template process_tile<1, 1, 3, 5, 0, 2>,
- Conv::template process_tile<1, 1, 3, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 5, 1, 0>,
- Conv::template process_tile<1, 1, 3, 5, 1, 1>,
- Conv::template process_tile<1, 1, 3, 5, 1, 2>,
- Conv::template process_tile<1, 1, 3, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 5, 2, 0>,
- Conv::template process_tile<1, 1, 3, 5, 2, 1>,
- Conv::template process_tile<1, 1, 3, 5, 2, 2>,
- Conv::template process_tile<1, 1, 3, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 5, 3, 0>,
- Conv::template process_tile<1, 1, 3, 5, 3, 1>,
- Conv::template process_tile<1, 1, 3, 5, 3, 2>,
- Conv::template process_tile<1, 1, 3, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 3, 6, 0, 0>,
- Conv::template process_tile<1, 1, 3, 6, 0, 1>,
- Conv::template process_tile<1, 1, 3, 6, 0, 2>,
- Conv::template process_tile<1, 1, 3, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 3, 6, 1, 0>,
- Conv::template process_tile<1, 1, 3, 6, 1, 1>,
- Conv::template process_tile<1, 1, 3, 6, 1, 2>,
- Conv::template process_tile<1, 1, 3, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 3, 6, 2, 0>,
- Conv::template process_tile<1, 1, 3, 6, 2, 1>,
- Conv::template process_tile<1, 1, 3, 6, 2, 2>,
- Conv::template process_tile<1, 1, 3, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 3, 6, 3, 0>,
- Conv::template process_tile<1, 1, 3, 6, 3, 1>,
- Conv::template process_tile<1, 1, 3, 6, 3, 2>,
- Conv::template process_tile<1, 1, 3, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 3
- { // Input pad bottom = 4
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 0, 0, 0>,
- Conv::template process_tile<1, 1, 4, 0, 0, 1>,
- Conv::template process_tile<1, 1, 4, 0, 0, 2>,
- Conv::template process_tile<1, 1, 4, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 0, 1, 0>,
- Conv::template process_tile<1, 1, 4, 0, 1, 1>,
- Conv::template process_tile<1, 1, 4, 0, 1, 2>,
- Conv::template process_tile<1, 1, 4, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 0, 2, 0>,
- Conv::template process_tile<1, 1, 4, 0, 2, 1>,
- Conv::template process_tile<1, 1, 4, 0, 2, 2>,
- Conv::template process_tile<1, 1, 4, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 0, 3, 0>,
- Conv::template process_tile<1, 1, 4, 0, 3, 1>,
- Conv::template process_tile<1, 1, 4, 0, 3, 2>,
- Conv::template process_tile<1, 1, 4, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 1, 0, 0>,
- Conv::template process_tile<1, 1, 4, 1, 0, 1>,
- Conv::template process_tile<1, 1, 4, 1, 0, 2>,
- Conv::template process_tile<1, 1, 4, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 1, 1, 0>,
- Conv::template process_tile<1, 1, 4, 1, 1, 1>,
- Conv::template process_tile<1, 1, 4, 1, 1, 2>,
- Conv::template process_tile<1, 1, 4, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 1, 2, 0>,
- Conv::template process_tile<1, 1, 4, 1, 2, 1>,
- Conv::template process_tile<1, 1, 4, 1, 2, 2>,
- Conv::template process_tile<1, 1, 4, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 1, 3, 0>,
- Conv::template process_tile<1, 1, 4, 1, 3, 1>,
- Conv::template process_tile<1, 1, 4, 1, 3, 2>,
- Conv::template process_tile<1, 1, 4, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 2, 0, 0>,
- Conv::template process_tile<1, 1, 4, 2, 0, 1>,
- Conv::template process_tile<1, 1, 4, 2, 0, 2>,
- Conv::template process_tile<1, 1, 4, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 2, 1, 0>,
- Conv::template process_tile<1, 1, 4, 2, 1, 1>,
- Conv::template process_tile<1, 1, 4, 2, 1, 2>,
- Conv::template process_tile<1, 1, 4, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 2, 2, 0>,
- Conv::template process_tile<1, 1, 4, 2, 2, 1>,
- Conv::template process_tile<1, 1, 4, 2, 2, 2>,
- Conv::template process_tile<1, 1, 4, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 2, 3, 0>,
- Conv::template process_tile<1, 1, 4, 2, 3, 1>,
- Conv::template process_tile<1, 1, 4, 2, 3, 2>,
- Conv::template process_tile<1, 1, 4, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 3, 0, 0>,
- Conv::template process_tile<1, 1, 4, 3, 0, 1>,
- Conv::template process_tile<1, 1, 4, 3, 0, 2>,
- Conv::template process_tile<1, 1, 4, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 3, 1, 0>,
- Conv::template process_tile<1, 1, 4, 3, 1, 1>,
- Conv::template process_tile<1, 1, 4, 3, 1, 2>,
- Conv::template process_tile<1, 1, 4, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 3, 2, 0>,
- Conv::template process_tile<1, 1, 4, 3, 2, 1>,
- Conv::template process_tile<1, 1, 4, 3, 2, 2>,
- Conv::template process_tile<1, 1, 4, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 3, 3, 0>,
- Conv::template process_tile<1, 1, 4, 3, 3, 1>,
- Conv::template process_tile<1, 1, 4, 3, 3, 2>,
- Conv::template process_tile<1, 1, 4, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 4, 0, 0>,
- Conv::template process_tile<1, 1, 4, 4, 0, 1>,
- Conv::template process_tile<1, 1, 4, 4, 0, 2>,
- Conv::template process_tile<1, 1, 4, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 4, 1, 0>,
- Conv::template process_tile<1, 1, 4, 4, 1, 1>,
- Conv::template process_tile<1, 1, 4, 4, 1, 2>,
- Conv::template process_tile<1, 1, 4, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 4, 2, 0>,
- Conv::template process_tile<1, 1, 4, 4, 2, 1>,
- Conv::template process_tile<1, 1, 4, 4, 2, 2>,
- Conv::template process_tile<1, 1, 4, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 4, 3, 0>,
- Conv::template process_tile<1, 1, 4, 4, 3, 1>,
- Conv::template process_tile<1, 1, 4, 4, 3, 2>,
- Conv::template process_tile<1, 1, 4, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 5, 0, 0>,
- Conv::template process_tile<1, 1, 4, 5, 0, 1>,
- Conv::template process_tile<1, 1, 4, 5, 0, 2>,
- Conv::template process_tile<1, 1, 4, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 5, 1, 0>,
- Conv::template process_tile<1, 1, 4, 5, 1, 1>,
- Conv::template process_tile<1, 1, 4, 5, 1, 2>,
- Conv::template process_tile<1, 1, 4, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 5, 2, 0>,
- Conv::template process_tile<1, 1, 4, 5, 2, 1>,
- Conv::template process_tile<1, 1, 4, 5, 2, 2>,
- Conv::template process_tile<1, 1, 4, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 5, 3, 0>,
- Conv::template process_tile<1, 1, 4, 5, 3, 1>,
- Conv::template process_tile<1, 1, 4, 5, 3, 2>,
- Conv::template process_tile<1, 1, 4, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 4, 6, 0, 0>,
- Conv::template process_tile<1, 1, 4, 6, 0, 1>,
- Conv::template process_tile<1, 1, 4, 6, 0, 2>,
- Conv::template process_tile<1, 1, 4, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 4, 6, 1, 0>,
- Conv::template process_tile<1, 1, 4, 6, 1, 1>,
- Conv::template process_tile<1, 1, 4, 6, 1, 2>,
- Conv::template process_tile<1, 1, 4, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 4, 6, 2, 0>,
- Conv::template process_tile<1, 1, 4, 6, 2, 1>,
- Conv::template process_tile<1, 1, 4, 6, 2, 2>,
- Conv::template process_tile<1, 1, 4, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 4, 6, 3, 0>,
- Conv::template process_tile<1, 1, 4, 6, 3, 1>,
- Conv::template process_tile<1, 1, 4, 6, 3, 2>,
- Conv::template process_tile<1, 1, 4, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 4
- { // Input pad bottom = 5
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 0, 0, 0>,
- Conv::template process_tile<1, 1, 5, 0, 0, 1>,
- Conv::template process_tile<1, 1, 5, 0, 0, 2>,
- Conv::template process_tile<1, 1, 5, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 0, 1, 0>,
- Conv::template process_tile<1, 1, 5, 0, 1, 1>,
- Conv::template process_tile<1, 1, 5, 0, 1, 2>,
- Conv::template process_tile<1, 1, 5, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 0, 2, 0>,
- Conv::template process_tile<1, 1, 5, 0, 2, 1>,
- Conv::template process_tile<1, 1, 5, 0, 2, 2>,
- Conv::template process_tile<1, 1, 5, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 0, 3, 0>,
- Conv::template process_tile<1, 1, 5, 0, 3, 1>,
- Conv::template process_tile<1, 1, 5, 0, 3, 2>,
- Conv::template process_tile<1, 1, 5, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 1, 0, 0>,
- Conv::template process_tile<1, 1, 5, 1, 0, 1>,
- Conv::template process_tile<1, 1, 5, 1, 0, 2>,
- Conv::template process_tile<1, 1, 5, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 1, 1, 0>,
- Conv::template process_tile<1, 1, 5, 1, 1, 1>,
- Conv::template process_tile<1, 1, 5, 1, 1, 2>,
- Conv::template process_tile<1, 1, 5, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 1, 2, 0>,
- Conv::template process_tile<1, 1, 5, 1, 2, 1>,
- Conv::template process_tile<1, 1, 5, 1, 2, 2>,
- Conv::template process_tile<1, 1, 5, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 1, 3, 0>,
- Conv::template process_tile<1, 1, 5, 1, 3, 1>,
- Conv::template process_tile<1, 1, 5, 1, 3, 2>,
- Conv::template process_tile<1, 1, 5, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 2, 0, 0>,
- Conv::template process_tile<1, 1, 5, 2, 0, 1>,
- Conv::template process_tile<1, 1, 5, 2, 0, 2>,
- Conv::template process_tile<1, 1, 5, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 2, 1, 0>,
- Conv::template process_tile<1, 1, 5, 2, 1, 1>,
- Conv::template process_tile<1, 1, 5, 2, 1, 2>,
- Conv::template process_tile<1, 1, 5, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 2, 2, 0>,
- Conv::template process_tile<1, 1, 5, 2, 2, 1>,
- Conv::template process_tile<1, 1, 5, 2, 2, 2>,
- Conv::template process_tile<1, 1, 5, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 2, 3, 0>,
- Conv::template process_tile<1, 1, 5, 2, 3, 1>,
- Conv::template process_tile<1, 1, 5, 2, 3, 2>,
- Conv::template process_tile<1, 1, 5, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 3, 0, 0>,
- Conv::template process_tile<1, 1, 5, 3, 0, 1>,
- Conv::template process_tile<1, 1, 5, 3, 0, 2>,
- Conv::template process_tile<1, 1, 5, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 3, 1, 0>,
- Conv::template process_tile<1, 1, 5, 3, 1, 1>,
- Conv::template process_tile<1, 1, 5, 3, 1, 2>,
- Conv::template process_tile<1, 1, 5, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 3, 2, 0>,
- Conv::template process_tile<1, 1, 5, 3, 2, 1>,
- Conv::template process_tile<1, 1, 5, 3, 2, 2>,
- Conv::template process_tile<1, 1, 5, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 3, 3, 0>,
- Conv::template process_tile<1, 1, 5, 3, 3, 1>,
- Conv::template process_tile<1, 1, 5, 3, 3, 2>,
- Conv::template process_tile<1, 1, 5, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 4, 0, 0>,
- Conv::template process_tile<1, 1, 5, 4, 0, 1>,
- Conv::template process_tile<1, 1, 5, 4, 0, 2>,
- Conv::template process_tile<1, 1, 5, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 4, 1, 0>,
- Conv::template process_tile<1, 1, 5, 4, 1, 1>,
- Conv::template process_tile<1, 1, 5, 4, 1, 2>,
- Conv::template process_tile<1, 1, 5, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 4, 2, 0>,
- Conv::template process_tile<1, 1, 5, 4, 2, 1>,
- Conv::template process_tile<1, 1, 5, 4, 2, 2>,
- Conv::template process_tile<1, 1, 5, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 4, 3, 0>,
- Conv::template process_tile<1, 1, 5, 4, 3, 1>,
- Conv::template process_tile<1, 1, 5, 4, 3, 2>,
- Conv::template process_tile<1, 1, 5, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 5, 0, 0>,
- Conv::template process_tile<1, 1, 5, 5, 0, 1>,
- Conv::template process_tile<1, 1, 5, 5, 0, 2>,
- Conv::template process_tile<1, 1, 5, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 5, 1, 0>,
- Conv::template process_tile<1, 1, 5, 5, 1, 1>,
- Conv::template process_tile<1, 1, 5, 5, 1, 2>,
- Conv::template process_tile<1, 1, 5, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 5, 2, 0>,
- Conv::template process_tile<1, 1, 5, 5, 2, 1>,
- Conv::template process_tile<1, 1, 5, 5, 2, 2>,
- Conv::template process_tile<1, 1, 5, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 5, 3, 0>,
- Conv::template process_tile<1, 1, 5, 5, 3, 1>,
- Conv::template process_tile<1, 1, 5, 5, 3, 2>,
- Conv::template process_tile<1, 1, 5, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 5, 6, 0, 0>,
- Conv::template process_tile<1, 1, 5, 6, 0, 1>,
- Conv::template process_tile<1, 1, 5, 6, 0, 2>,
- Conv::template process_tile<1, 1, 5, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 5, 6, 1, 0>,
- Conv::template process_tile<1, 1, 5, 6, 1, 1>,
- Conv::template process_tile<1, 1, 5, 6, 1, 2>,
- Conv::template process_tile<1, 1, 5, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 5, 6, 2, 0>,
- Conv::template process_tile<1, 1, 5, 6, 2, 1>,
- Conv::template process_tile<1, 1, 5, 6, 2, 2>,
- Conv::template process_tile<1, 1, 5, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 5, 6, 3, 0>,
- Conv::template process_tile<1, 1, 5, 6, 3, 1>,
- Conv::template process_tile<1, 1, 5, 6, 3, 2>,
- Conv::template process_tile<1, 1, 5, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 5
- { // Input pad bottom = 6
- { // Input pad right = 0
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 0, 0, 0>,
- Conv::template process_tile<1, 1, 6, 0, 0, 1>,
- Conv::template process_tile<1, 1, 6, 0, 0, 2>,
- Conv::template process_tile<1, 1, 6, 0, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 0, 1, 0>,
- Conv::template process_tile<1, 1, 6, 0, 1, 1>,
- Conv::template process_tile<1, 1, 6, 0, 1, 2>,
- Conv::template process_tile<1, 1, 6, 0, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 0, 2, 0>,
- Conv::template process_tile<1, 1, 6, 0, 2, 1>,
- Conv::template process_tile<1, 1, 6, 0, 2, 2>,
- Conv::template process_tile<1, 1, 6, 0, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 0, 3, 0>,
- Conv::template process_tile<1, 1, 6, 0, 3, 1>,
- Conv::template process_tile<1, 1, 6, 0, 3, 2>,
- Conv::template process_tile<1, 1, 6, 0, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 0
- { // Input pad right = 1
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 1, 0, 0>,
- Conv::template process_tile<1, 1, 6, 1, 0, 1>,
- Conv::template process_tile<1, 1, 6, 1, 0, 2>,
- Conv::template process_tile<1, 1, 6, 1, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 1, 1, 0>,
- Conv::template process_tile<1, 1, 6, 1, 1, 1>,
- Conv::template process_tile<1, 1, 6, 1, 1, 2>,
- Conv::template process_tile<1, 1, 6, 1, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 1, 2, 0>,
- Conv::template process_tile<1, 1, 6, 1, 2, 1>,
- Conv::template process_tile<1, 1, 6, 1, 2, 2>,
- Conv::template process_tile<1, 1, 6, 1, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 1, 3, 0>,
- Conv::template process_tile<1, 1, 6, 1, 3, 1>,
- Conv::template process_tile<1, 1, 6, 1, 3, 2>,
- Conv::template process_tile<1, 1, 6, 1, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 1
- { // Input pad right = 2
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 2, 0, 0>,
- Conv::template process_tile<1, 1, 6, 2, 0, 1>,
- Conv::template process_tile<1, 1, 6, 2, 0, 2>,
- Conv::template process_tile<1, 1, 6, 2, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 2, 1, 0>,
- Conv::template process_tile<1, 1, 6, 2, 1, 1>,
- Conv::template process_tile<1, 1, 6, 2, 1, 2>,
- Conv::template process_tile<1, 1, 6, 2, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 2, 2, 0>,
- Conv::template process_tile<1, 1, 6, 2, 2, 1>,
- Conv::template process_tile<1, 1, 6, 2, 2, 2>,
- Conv::template process_tile<1, 1, 6, 2, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 2, 3, 0>,
- Conv::template process_tile<1, 1, 6, 2, 3, 1>,
- Conv::template process_tile<1, 1, 6, 2, 3, 2>,
- Conv::template process_tile<1, 1, 6, 2, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 2
- { // Input pad right = 3
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 3, 0, 0>,
- Conv::template process_tile<1, 1, 6, 3, 0, 1>,
- Conv::template process_tile<1, 1, 6, 3, 0, 2>,
- Conv::template process_tile<1, 1, 6, 3, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 3, 1, 0>,
- Conv::template process_tile<1, 1, 6, 3, 1, 1>,
- Conv::template process_tile<1, 1, 6, 3, 1, 2>,
- Conv::template process_tile<1, 1, 6, 3, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 3, 2, 0>,
- Conv::template process_tile<1, 1, 6, 3, 2, 1>,
- Conv::template process_tile<1, 1, 6, 3, 2, 2>,
- Conv::template process_tile<1, 1, 6, 3, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 3, 3, 0>,
- Conv::template process_tile<1, 1, 6, 3, 3, 1>,
- Conv::template process_tile<1, 1, 6, 3, 3, 2>,
- Conv::template process_tile<1, 1, 6, 3, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 3
- { // Input pad right = 4
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 4, 0, 0>,
- Conv::template process_tile<1, 1, 6, 4, 0, 1>,
- Conv::template process_tile<1, 1, 6, 4, 0, 2>,
- Conv::template process_tile<1, 1, 6, 4, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 4, 1, 0>,
- Conv::template process_tile<1, 1, 6, 4, 1, 1>,
- Conv::template process_tile<1, 1, 6, 4, 1, 2>,
- Conv::template process_tile<1, 1, 6, 4, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 4, 2, 0>,
- Conv::template process_tile<1, 1, 6, 4, 2, 1>,
- Conv::template process_tile<1, 1, 6, 4, 2, 2>,
- Conv::template process_tile<1, 1, 6, 4, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 4, 3, 0>,
- Conv::template process_tile<1, 1, 6, 4, 3, 1>,
- Conv::template process_tile<1, 1, 6, 4, 3, 2>,
- Conv::template process_tile<1, 1, 6, 4, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 4
- { // Input pad right = 5
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 5, 0, 0>,
- Conv::template process_tile<1, 1, 6, 5, 0, 1>,
- Conv::template process_tile<1, 1, 6, 5, 0, 2>,
- Conv::template process_tile<1, 1, 6, 5, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 5, 1, 0>,
- Conv::template process_tile<1, 1, 6, 5, 1, 1>,
- Conv::template process_tile<1, 1, 6, 5, 1, 2>,
- Conv::template process_tile<1, 1, 6, 5, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 5, 2, 0>,
- Conv::template process_tile<1, 1, 6, 5, 2, 1>,
- Conv::template process_tile<1, 1, 6, 5, 2, 2>,
- Conv::template process_tile<1, 1, 6, 5, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 5, 3, 0>,
- Conv::template process_tile<1, 1, 6, 5, 3, 1>,
- Conv::template process_tile<1, 1, 6, 5, 3, 2>,
- Conv::template process_tile<1, 1, 6, 5, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 5
- { // Input pad right = 6
- { // Output pad bottom = 0
- Conv::template process_tile<1, 1, 6, 6, 0, 0>,
- Conv::template process_tile<1, 1, 6, 6, 0, 1>,
- Conv::template process_tile<1, 1, 6, 6, 0, 2>,
- Conv::template process_tile<1, 1, 6, 6, 0, 3>,
- }, // Output pad bottom = 0
- { // Output pad bottom = 1
- Conv::template process_tile<1, 1, 6, 6, 1, 0>,
- Conv::template process_tile<1, 1, 6, 6, 1, 1>,
- Conv::template process_tile<1, 1, 6, 6, 1, 2>,
- Conv::template process_tile<1, 1, 6, 6, 1, 3>,
- }, // Output pad bottom = 1
- { // Output pad bottom = 2
- Conv::template process_tile<1, 1, 6, 6, 2, 0>,
- Conv::template process_tile<1, 1, 6, 6, 2, 1>,
- Conv::template process_tile<1, 1, 6, 6, 2, 2>,
- Conv::template process_tile<1, 1, 6, 6, 2, 3>,
- }, // Output pad bottom = 2
- { // Output pad bottom = 3
- Conv::template process_tile<1, 1, 6, 6, 3, 0>,
- Conv::template process_tile<1, 1, 6, 6, 3, 1>,
- Conv::template process_tile<1, 1, 6, 6, 3, 2>,
- Conv::template process_tile<1, 1, 6, 6, 3, 3>,
- }, // Output pad bottom = 3
- }, // Input pad right = 6
- }, // Input pad bottom = 6
- }, // Input pad left = 1
- }, // Input pad top = 1
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
};
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
} // namespace depthwise
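The hunk above replaces the exhaustive tile-function table: previously one `process_tile` instantiation was emitted for every combination of input padding (top, left, bottom, right) and output padding (bottom, right), a six-dimensional table running to hundreds of entries per kernel variant. The new code keeps a single fully specialised unpadded kernel, one small table per padded edge (`tilefn_top`, `tilefn_left`, `tilefn_bottom`, `tilefn_right`), and one runtime-generic `process_tile<false>` for any remaining combination, sharply reducing template instantiations, compile time, and code size. A minimal sketch of the dispatch this decomposition implies is shown below; the selector and the dummy kernels are illustrative assumptions, not the library's actual dispatch code.

```cpp
#include <cstdio>

// Stand-ins for the process_tile<...> instantiations named in the diff.
using TileFn = void (*)();
static void unpadded_tile()   { std::puts("fully specialised, unpadded path"); }
static void top_padded_tile() { std::puts("top-padded specialisation"); }
static void generic_tile()    { std::puts("runtime-generic fallback"); }

// Per-edge tables, mirroring tilefn_unpadded / tilefn_top / tilefn_generic
// above: one small table per padded edge instead of one entry for every
// combination of all six padding parameters.
static const TileFn tilefn_unpadded = unpadded_tile;
static const TileFn tilefn_top[2]   = { unpadded_tile, top_padded_tile };
static const TileFn tilefn_generic  = generic_tile;

// Hypothetical selector: use a specialised kernel when at most one edge
// is padded, otherwise fall back to the runtime-generic kernel.
static TileFn select_tile_fn(int pad_top, int pad_left, int pad_bottom, int pad_right)
{
  const int padded_edges =
      (pad_top != 0) + (pad_left != 0) + (pad_bottom != 0) + (pad_right != 0);
  if (padded_edges == 0)
    return tilefn_unpadded;
  if (padded_edges == 1 && pad_top != 0)
    return tilefn_top[pad_top]; // pad_top < n_in_pad_top_fns (= 2) by construction
  // ... analogous lookups into tilefn_left / _bottom / _right elided ...
  return tilefn_generic;
}

int main()
{
  select_tile_fn(0, 0, 0, 0)(); // unpadded fast path
  select_tile_fn(1, 0, 0, 0)(); // top-padded specialisation
  select_tile_fn(1, 0, 1, 0)(); // generic fallback
}
```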
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index a95ce0e..3b3cda0 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -86,148 +86,288 @@
const float *inptr = matrix_base;
const float *bptr = biases;
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
+ if (bptr)
{
- // Matrices used and computed during this transform
- float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 4; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Load the bias vector
+ b = vld1q_f32(bptr);
+ bptr += 4;
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
}
}
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Load the bias vector
- b = vld1q_f32(bptr);
- bptr += 4;
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
- }
- }
- }
#endif // __aarch64__
#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- for (int j = 0; j < 4; j++, m++)
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Load the bias vector
+ b = vld1_f32(bptr);
+ bptr += 2;
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
}
}
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2], b;
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Load the bias vector
- b = vld1_f32(bptr);
- bptr += 2;
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Load the bias
+ b = *(bptr++);
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
}
}
}
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
+ else
{
- // Matrices used and computed during this transform
- float F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 4; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- F[i][j] = *(inptr + m*matrix_stride);
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
}
}
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- }
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2];
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- }
-
- // Load the bias
- b = *(bptr++);
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- *(outptrs[i][j]++) = f[i][j] + b;
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
}
}
}
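This output-transform change does two independent things. First, bias handling: rather than unconditionally dereferencing `bptr`, the whole channel loop is cloned under `if (bptr)` / `else`, so a null bias pointer is now legal and the bias-free path skips both the vector loads and the adds instead of branching inside the hot loop. Second, the arithmetic is untouched: the aarch64 four-lane, generic-NEON two-lane, and scalar tails all evaluate the same output transform f = ZT F Z on a 4x4 Winograd-domain tile. Reconstructing the matrix from the `FZ`/`f` expressions in the code (this is the usual F(2x2, 3x3) output transform, often written A^T in the Winograd literature):

```latex
Z = \begin{pmatrix} 1 & 0 \\ 1 & 1 \\ 1 & -1 \\ 0 & -1 \end{pmatrix},
\qquad
f = Z^{\mathsf{T}} F Z \in \mathbb{R}^{2\times 2},
```

so, for example, `FZ[i][0] = F[i][0] + F[i][1] + F[i][2]` is row i of F times the first column of Z.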
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index 6bb1674..8668535 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -35,6 +35,7 @@
template <>
int Transform::ops_performed(const Tensor4DShape &shape)
{
+ (void) shape;
return 0;
}
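The `(void) shape;` added above is the standard C++ idiom for marking a parameter as deliberately unused: it silences `-Wunused-parameter` while keeping the signature demanded by the `Transform` interface. A minimal, self-contained illustration (the function name is hypothetical):

```cpp
// ops_performed for this transform always reports zero, but the parameter
// must stay for interface compatibility; the void-cast documents that it
// is unused on purpose and suppresses the compiler warning.
int ops_performed_stub(int shape)
{
  (void) shape;
  return 0;
}
```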
@@ -83,142 +84,282 @@
const float *inptr = matrix_base;
const float *bptr = biases;
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
+ if (bptr)
{
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
}
}
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1q_f32(bptr);
- bptr += 4;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
- }
- }
- }
#endif // __aarch64__
#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ b = vld1_f32(bptr);
+ bptr += 2;
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
}
}
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][2], f[2][2], b;
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1_f32(bptr);
- bptr += 2;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ b = *(bptr++);
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
}
}
}
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
+ else
{
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][2], f[2][2];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = *(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
}
}
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- }
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][2], f[2][2];
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- b = *(bptr++);
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- *(outptrs[i][j]++) = f[i][j] + b;
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][2], f[2][2];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
}
}
}
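The same restructuring as in the 3x3 case is applied to F(2x2, 5x5): the channel loops are duplicated under `if (bptr)` / `else` so the bias may be absent. Here the Winograd-domain tile is 6x6, and the `FZ`/`f` expressions in the code correspond to

```latex
Z = \begin{pmatrix} 1 & 0 \\ 1 & 1 \\ 1 & -1 \\ 1 & 2 \\ 1 & -2 \\ 0 & 1 \end{pmatrix},
\qquad
f = Z^{\mathsf{T}} F Z \in \mathbb{R}^{2\times 2}.
```

The +-2 coefficients are what the `vmlaq_n_f32(..., 2.0f)` / `vmla_n_f32(..., 2.0f)` calls implement: `FZ[i][1] = (F[i][1] - F[i][2]) + 2*(F[i][3] - F[i][4]) + F[i][5]` costs one fused multiply-accumulate by the scalar 2 plus two additions.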
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index 609823b..cd3bdef 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -100,170 +100,338 @@
const float *inptr = matrix_base;
const float *bptr = biases;
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
+ if (bptr)
{
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
}
}
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1q_f32(bptr);
- bptr += 4;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
- }
- }
- }
#endif // __aarch64__
#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ b = vld1_f32(bptr);
+ bptr += 2;
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
}
}
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+ // Matrices used and computed during this transform
+        float F[6][6], FZ[6][4], f[4][4], b;
+
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1_f32(bptr);
- bptr += 2;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ b = *(bptr++);
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
}
}
}
-#endif
- for (; channels_remaining; channels_remaining--)
+ else
{
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = *(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
}
}
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- }
+ // Matrices used and computed during this transform
+            float32x2_t F[6][6], FZ[6][4], f[4][4];
+
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- b = *(bptr++);
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- *(outptrs[i][j]++) = f[i][j] + b;
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
}
}
}
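
The kernels above implement the output stage of Winograd F(4x4, 3x3): each 6x6 tile F read from the Winograd domain is reduced to a 4x4 spatial tile via f = Z^T F Z, with vectorised paths handling four (float32x4_t) or two (float32x2_t) channels per iteration and a scalar tail for the remainder; the bias b is added per output element when a bias pointer is supplied. A minimal scalar sketch of the same arithmetic for one channel, as a standalone illustration (the function name is hypothetical, not part of the library):

// Scalar sketch of the F(4x4, 3x3) Winograd output transform above:
// reduce a 6x6 Winograd-domain tile to a 4x4 spatial tile, f = Z^T F Z.
void winograd_output_transform_tile(const float F[6][6], float f[4][4])
{
    float FZ[6][4];
    for (int i = 0; i < 6; i++)
    {
        FZ[i][0] = F[i][0] + F[i][1] + F[i][2] + F[i][3] + F[i][4];
        FZ[i][1] = F[i][1] - F[i][2] + 2.0f*(F[i][3] - F[i][4]);
        FZ[i][2] = F[i][1] + F[i][2] + 4.0f*(F[i][3] + F[i][4]);
        FZ[i][3] = F[i][1] - F[i][2] + 8.0f*(F[i][3] - F[i][4]) + F[i][5];
    }
    for (int j = 0; j < 4; j++)
    {
        f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j] + FZ[3][j] + FZ[4][j];
        f[1][j] = FZ[1][j] - FZ[2][j] + 2.0f*(FZ[3][j] - FZ[4][j]);
        f[2][j] = FZ[1][j] + FZ[2][j] + 4.0f*(FZ[3][j] + FZ[4][j]);
        f[3][j] = FZ[1][j] - FZ[2][j] + 8.0f*(FZ[3][j] - FZ[4][j]) + FZ[5][j];
    }
}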
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
index c082356..a5d4302 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
@@ -21,9 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include <cstring>
#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
-
using namespace winograd;
/** Get the output shape of a convolution. */
@@ -37,8 +37,8 @@
{
return Tensor4DShape {
in_shape.n_batches,
- (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
- (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
+ (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
+ (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
kernel_shape.n_output_channels,
in_shape.ordering
};
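
The shape rule encoded above: PADDING_SAME preserves the input's spatial extent, while PADDING_VALID shrinks each spatial dimension by (kernel extent - 1). As a quick check (hypothetical helper, not library API):

// PADDING_VALID output extent for a unit-stride convolution;
// e.g. valid_out_dim(224, 3) == 222, while PADDING_SAME keeps 224.
int valid_out_dim(int in_dim, int kernel_dim) { return in_dim - (kernel_dim - 1); }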
@@ -221,344 +221,6 @@
}
-/** Create a new Winograd operator. */
-template <int output_tile_rows, int output_tile_cols,
- int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Convolution<TOut, TIn>::Convolution(
- const KernelShape &kernel_shape,
- const Tensor4DShape &input_shape,
- const PaddingType padding,
- void *kernel_storage
-) : kernel_shape(kernel_shape), // Store the kernel shape
- kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)),
- manage_kernel_storage(kernel_storage == NULL),
- _kernel_storage(manage_kernel_storage ?
- ALLOCATE(get_kernel_storage_size(kernel_shape)) :
- kernel_storage),
- input_shape(input_shape),
- padding(padding),
- output_shape(get_output_shape(kernel_shape, input_shape, padding)),
- tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)),
- tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)),
- M(input_shape.n_batches * tile_rows * tile_cols),
- K(kernel_shape.n_input_channels),
- N(kernel_shape.n_output_channels),
- prof()
-{
- // Create pointers to the kernel matrices
- const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
- int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
- for (int i = 0; i < N_GEMMS; i++) {
- kernel_matrices[i] = reinterpret_cast<TIn *>(
- ks_bytes + i*kernel_matrix_size_bytes);
- }
-}
-
-
-/** Create a new Winograd operator and initialise the weights. */
-template <int output_tile_rows, int output_tile_cols,
- int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Convolution<TOut, TIn>::Convolution(
- const KernelShape &kernel_shape,
- const Tensor4DShape &input_shape,
- const PaddingType padding,
- const TIn* const kernel,
- void *kernel_storage,
- void *transform_working_space
-) : Convolution(kernel_shape, input_shape, padding, kernel_storage)
-{
- transform_weights(kernel, transform_working_space);
-}
-
-
-/** Clean up a convolution engine. */
-template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::~Convolution()
-{
- // If we were responsible for managing kernel storage ensure that it is
- // freed.
- if (manage_kernel_storage)
- {
- free(_kernel_storage);
- }
-}
-
-
-/** Transform weights into the Winograd domain and store them for later use/reuse. */
-template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-template <typename WeightsTransformT>
-void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::transform_weights(
- const TIn* const kernel,
- void *transform_working_space
-)
-{
- // Allocate working space if it is required
- bool allocated_working_space = false;
- if (transform_working_space == NULL && // If no memory has been provided
- get_kernel_transform_working_size(kernel_shape) != 0) // And we need the space
- {
- allocated_working_space = true;
- transform_working_space = ALLOCATE(
- get_kernel_transform_working_size(kernel_shape)
- );
- }
-
- // The transformation methods only work on weights laid out in HWIO form, if
- // the weights are not in this form then we need to re-order them.
- const TIn *kernel_hwio = kernel;
- if (kernel_shape.ordering != HWIO)
- {
- kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
-
- // Re-order the weights from OIHW to HWIO
- this->prof(
- "Weight reorder",
- [&kernel, &kernel_hwio, this] () {
- reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
- kernel, const_cast<TIn *>(kernel_hwio),
- kernel_shape.n_output_channels,
- kernel_shape.n_input_channels,
- kernel_shape.n_rows,
- kernel_shape.n_cols
- );
- },
- kernel_shape.size() * sizeof(TIn),
- 0,
- kernel_shape.size() * sizeof(TIn)
- );
- }
-
- const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
- WeightsTransformT weights_transform(
- kernel_hwio, kernel_matrices[0],
- kernel_matrix_size_bytes / sizeof(TIn),
- kernel_matrix_row_stride,
- kernel_shape.n_output_channels,
- kernel_shape.n_input_channels
- );
-
- // Transform the weights into the Winograd domain
- auto kernel_prep = [&] ()
- {
- weights_transform.run(0, weights_transform.get_window());
- };
-
- prof(
- "Kernel Prep", kernel_prep,
- WeightsTransformT::bytes_read(kernel_shape),
- WeightsTransformT::ops_performed(kernel_shape),
- WeightsTransformT::bytes_written(kernel_shape)
- );
-
- // Free memory if we allocated it
- if (allocated_working_space)
- {
- free(transform_working_space);
- }
-}
-
-
-/** Perform a convolution. */
-template <int output_tile_rows, int output_tile_cols,
- int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::execute(
- TOut* const output,
- const TIn* const input,
- const TOut* const biases,
- void *working_space,
- const int n_threads
-)
-{
- const auto padding_type = padding;
- const auto input_shape = this->input_shape;
-
- // Allocate working space if none has been provided
- const bool manage_working_space = (working_space == NULL);
- if (manage_working_space)
- {
- const size_t ws_size = get_working_space_size(
- kernel_shape, input_shape, padding_type
- );
- working_space = ALLOCATE(ws_size * sizeof(int8_t));
- memset(working_space, 0x00, ws_size);
- }
- int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
-
- // Split the working space into that required for 16 input matrices and
- // output matrices.
- TIn *input_matrices[N_GEMMS];
- TOut *output_matrices[N_GEMMS];
- const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type);
- const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type);
-
- for (int i = 0; i < N_GEMMS; i++)
- {
- input_matrices[i] = reinterpret_cast<TIn *>(
- ws_bytes + i*in_matrix_stride_bytes);
- output_matrices[i] = reinterpret_cast<TIn *>(
- ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
- }
-
- // If we need to re-order the input and output tensors then the final chunk
- // of the working space can be used for this purpose.
- const TIn* input_nhwc = input;
- if (input_shape.ordering == NCHW)
- {
- input_nhwc = reinterpret_cast<TIn *>(
- ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
- );
-
- this->prof(
- "NCHW -> NHWC",
- [input, input_shape, input_nhwc] () {
- reorder::nchw_to_nhwc(
- input, const_cast<TIn *>(input_nhwc),
- input_shape.n_batches,
- input_shape.n_channels,
- input_shape.n_rows,
- input_shape.n_cols
- );
- },
- input_shape.size(), 0, input_shape.size()
- );
- }
-
- // Compute shape for the GEMM
- const auto output_shape = this->output_shape;
- int M = this->M;
- int K = this->K;
- int N = this->N;
-
- const int in_matrix_row_stride = K;
- const int out_matrix_row_stride = kernel_matrix_row_stride;
-
- InputTransform<TIn> input_transform(
- input_nhwc,
- input_shape.n_batches,
- input_shape.n_rows,
- input_shape.n_cols,
- input_shape.n_channels,
- padding_type,
- input_matrices[0],
- in_matrix_stride_bytes / sizeof(TIn),
- in_matrix_row_stride
- );
-
- // Transform the input into the Winograd domain
- auto input_prep = [&] () {
- input_transform.run(0, input_transform.get_window());
- };
- prof(
- "Input Prep", input_prep,
- InputTransform<TIn>::bytes_read(input_shape),
- InputTransform<TIn>::ops_performed(input_shape),
- InputTransform<TIn>::bytes_written(input_shape)
- );
-
- // Perform the GEMMs
- const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape);
- BatchedBlockedGemm<M_BLOCK, N_BLOCK, TOut, TIn> gemms(
- N_GEMMS, M, K, N,
- in_matrix_stride_bytes / sizeof(TIn),
- in_matrix_row_stride,
- kernel_matrix_stride_bytes / sizeof(TIn),
- kernel_matrix_row_stride,
- out_matrix_stride_bytes / sizeof(TOut),
- out_matrix_row_stride,
- input_matrices[0],
- kernel_matrices[0],
- output_matrices[0]
- );
- for (unsigned int i = 0; i < gemms.get_window(); i++)
- {
- auto run_gemm = [&] () { gemms.run(i, i+1); };
- prof("GEMM", run_gemm, 0, 0, 0);
- }
-
- // If the output tensor needs to be in NCHW form then store the NHWC output
- // tensor in temporary storage and then reorder. If the output tensor needs
- // to be in NHWC then just write straight to the output tensor.
- TOut *output_nhwc = output;
- if (input_shape.ordering == NCHW)
- {
- output_nhwc = reinterpret_cast<TOut *>(
- ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
- );
- }
-
- // Transform the output tensor from the Winograd domain to the spatial
- // domain.
- OutputTransform<TOut> output_transform(
- output_matrices[0],
- out_matrix_stride_bytes / sizeof(TOut),
- out_matrix_row_stride,
- biases,
- output_nhwc,
- output_shape.n_batches,
- output_shape.n_rows,
- output_shape.n_cols,
- output_shape.n_channels
- );
- auto output_prep = [&] () {
- output_transform.run(0, output_transform.get_window());
- };
- prof(
- "Output Comp", output_prep,
- OutputTransform<TOut>::bytes_read(output_shape),
- OutputTransform<TOut>::ops_performed(output_shape),
- OutputTransform<TOut>::bytes_written(output_shape)
- );
-
- // Reorder the output tensor if it is required to be in NCHW form.
- if (input_shape.ordering == NCHW)
- {
- prof(
- "NHWC -> NCHW",
- [output_nhwc, output_shape, output] () {
- reorder::nhwc_to_nchw(
- output_nhwc, output,
- output_shape.n_batches,
- output_shape.n_rows,
- output_shape.n_cols,
- output_shape.n_channels
- );
- },
- output_shape.size(), 0, output_shape.size()
- );
- }
-
- // Free working space if we were responsible for allocating it
- if (manage_working_space)
- {
- free(working_space);
- }
-}
-
-
-/** Perform a convolution. */
-template <int output_tile_rows, int output_tile_cols,
- int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::execute(
- TOut* const output,
- const TIn* const input,
- const TOut* const biases,
- const int n_threads
-)
-{
- execute(output, input, biases, NULL, n_threads);
-}
-
-
// Instantiate required implementations
template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
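
The Convolution method definitions removed above implemented the complete Winograd execution pipeline. For reference, a comment-level sketch of the stage ordering that execute() followed (the function name is a placeholder, not a library call):

// Stage ordering of the removed Convolution<TOut, TIn>::execute().
void winograd_execute_sketch()
{
    // 1. Allocate and zero working space if the caller supplied none.
    // 2. Carve the working space into N_GEMMS input and output matrices.
    // 3. If the input tensor is NCHW, reorder it to NHWC in scratch storage.
    // 4. Input transform: scatter input tiles into the input matrices.
    // 5. Run the N_GEMMS batched blocked GEMMs against the kernel matrices.
    // 6. Output transform: gather tiles back to the spatial domain, adding biases.
    // 7. If NCHW output is required, reorder the NHWC result back to NCHW.
    // 8. Free the working space if it was allocated in step 1.
}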
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 836c379..237f133 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -100,6 +100,7 @@
if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+ _valid_region = ValidRegion{ _coords, shape };
}
else if(_extend_parent) // Extend parent shape, configure if specified
{
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index bd0c85f..676938a 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -34,7 +34,7 @@
TensorInfo::TensorInfo()
: _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
- _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info()
+ _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW)
{
}
@@ -53,6 +53,7 @@
_valid_region = info.valid_region();
_padding = info.padding();
_quantization_info = info.quantization_info();
+ _data_layout = info.data_layout();
}
TensorInfo::TensorInfo(Format format)
@@ -167,13 +168,13 @@
// Number of cells for each block
const Size2D num_cells_per_block = hog_info.num_cells_per_block();
- // Tensor Size = (Number of horizontal blocks) * (Number of vertical blocks )
- const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+ // Tensor Size = (Number of horizontal block positions) * (Number of vertical block positions)
+ const Size2D num_block_positions_per_img = hog_info.num_block_positions_per_image(Size2D(width, height));
// Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
- init(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+ init(TensorShape(num_block_positions_per_img.width, num_block_positions_per_img.height), num_channels, DataType::F32);
}
size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
@@ -211,13 +212,13 @@
// Number of cells for each block
const Size2D num_cells_per_block = hog_info.num_cells_per_block();
- // Tensor Size = (Number of horizontal blocks) * (Number of vertical blocks )
- const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+ // Tensor Size = (Number of horizontal block positions) * (Number of vertical block positions)
+ const Size2D num_block_positions_per_img = hog_info.num_block_positions_per_image(Size2D(width, height));
// Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
- return init_auto_padding(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+ return init_auto_padding(TensorShape(num_block_positions_per_img.width, num_block_positions_per_img.height), num_channels, DataType::F32);
}
bool TensorInfo::auto_padding()
@@ -321,7 +322,7 @@
{
_data_type = data_type;
_format = Format::UNKNOWN;
- return *this;
+ return set_tensor_shape(tensor_shape()); // Force total size and strides to update
}
ITensorInfo &TensorInfo::set_num_channels(int num_channels)
@@ -384,6 +385,12 @@
return *this;
}
+ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout)
+{
+ _data_layout = data_layout;
+ return *this;
+}
+
ITensorInfo &TensorInfo::reset_padding()
{
_padding = PaddingSize();
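
Note on the set_data_type() change above: strides and total size depend on the element size, so re-running set_tensor_shape() re-derives them instead of leaving stale byte geometry. A hedged illustration (constructor and accessor assumed from the public TensorInfo interface):

// Changing the data type now updates the byte geometry as well.
TensorInfo info(TensorShape(16U), 1, DataType::F32); // total size: 16 * 4 = 64 bytes
info.set_data_type(DataType::F16);                   // re-derived: 16 * 2 = 32 bytes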
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index f4b4553..b1c5992 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -126,6 +126,18 @@
return channels_map[channel];
}
+const std::string &arm_compute::string_from_data_layout(DataLayout dl)
+{
+ static std::map<DataLayout, const std::string> dl_map =
+ {
+ { DataLayout::UNKNOWN, "UNKNOWN" },
+ { DataLayout::NCHW, "NCHW" },
+ { DataLayout::NHWC, "NHWC" },
+ };
+
+ return dl_map[dl];
+}
+
const std::string &arm_compute::string_from_data_type(DataType dt)
{
static std::map<DataType, const std::string> dt_map =
@@ -145,6 +157,7 @@
{ DataType::F32, "F32" },
{ DataType::F64, "F64" },
{ DataType::SIZET, "SIZET" },
+ { DataType::QASYMM8, "QASYMM8" },
};
return dt_map[dt];
@@ -292,7 +305,8 @@
const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
unsigned int kernel_width, unsigned int kernel_height,
- const PadStrideInfo &pad_stride_info)
+ const PadStrideInfo &pad_stride_info,
+ const Size2D &dilation)
{
const unsigned int pad_left = pad_stride_info.pad_left();
const unsigned int pad_top = pad_stride_info.pad_top();
@@ -305,12 +319,12 @@
switch(pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation.x() * (kernel_width - 1) + 1)) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation.y() * (kernel_height - 1) + 1)) / stride_y) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation.x() * (kernel_width - 1) + 1)) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation.y() * (kernel_height - 1) + 1)) / stride_y) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index f5f9f1f..d4fabd4 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -167,9 +167,9 @@
const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
{
// Subtensor should not index in x, y dimensions.
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) && (coords.y() != 0)), function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
// Subtensor shape should match parent tensor in x, y dimensions.
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) && (parent_shape.y() != parent_shape.y())), function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != parent_shape.y())), function, file, line);
// Check dimensions
for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
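
The '&&' to '||' fixes above close a real validation hole: with '&&', both conditions had to fail together, so a sub-tensor offset in only one of x or y slipped through (and the second check compared parent_shape.y() to itself, which can never differ). A minimal check of the coordinate case:

#include <cassert>

int main()
{
    const int x = 4, y = 0; // offset in x only
    const bool old_check = (x != 0) && (y != 0); // false: bug, no error raised
    const bool new_check = (x != 0) || (y != 0); // true: error correctly raised
    assert(!old_check && new_check);
    return 0;
}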
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 2fe3a90..e1ffeed 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,292 +23,205 @@
*/
#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/CL/CLMap.h"
-#include "arm_compute/graph/CL/CLUnmap.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "support/ToolchainSupport.h"
-
-#include <sys/stat.h>
-
-using namespace arm_compute::graph;
-
-namespace
+namespace arm_compute
{
-bool file_exists(const std::string &filename)
+namespace graph
{
- std::ifstream file(filename);
- return file.good();
+Graph::Graph(GraphID id, std::string name)
+ : _id(id), _name(std::move(name)), _nodes(), _edges(), _tensors(), _tagged_nodes(), _mtx()
+{
}
-} // namespace
-struct Stage
+bool Graph::remove_node(NodeID nid)
{
- ITensorObject *_input;
- ITensorObject *_output;
- std::unique_ptr<arm_compute::IFunction> _function;
-};
-
-struct Graph::Private
-{
-public:
- /** Finalizes the current node's configuration
- *
- * @param _next_hint Device execution hint
- */
- void configure(GraphHints _next_hints);
-
- GraphContext _ctx{};
- std::vector<Stage> _pipeline{};
- std::vector<std::unique_ptr<ITensorObject>> _tensors{};
- std::vector<std::unique_ptr<INode>> _nodes{};
- GraphHints _current_hints{};
- GraphHints _next_hints{};
- std::unique_ptr<ITensorObject> _graph_input{ nullptr };
- std::unique_ptr<ITensorObject> _graph_output{ nullptr };
- std::unique_ptr<INode> _current_node{ nullptr };
- ITensorObject *_current_output{ nullptr };
- bool _info_enabled{ false };
- CLTuner _tuner{};
-
-private:
- ITensorObject *_current_input{ nullptr };
- GraphHints _previous_hints{};
-};
-
-static const std::string tuner_data_filename = "acl_tuner.csv";
-Graph::~Graph() //NOLINT
-{
- if(_pimpl->_tuner.tune_new_kernels() && !_pimpl->_tuner.lws_table().empty())
+ if(nid >= _nodes.size())
{
- _pimpl->_tuner.save_to_file(tuner_data_filename);
+ return false;
}
-}
-Graph::Graph()
- : _pimpl{ new Private() }
-{
- graph_init();
-}
+ std::unique_ptr<INode> &node = _nodes[nid];
-void Graph::graph_init(const bool use_cl_tuner)
-{
- // Check if OpenCL is available and initialize the scheduler
- if(opencl_is_available())
+ // Remove node connections
+ if(node)
{
- if(_pimpl->_tuner.lws_table().empty() && file_exists(tuner_data_filename))
+ for(auto &input_eid : node->_input_edges)
{
- _pimpl->_tuner.load_from_file(tuner_data_filename);
+ remove_connection(input_eid);
}
- _pimpl->_tuner.set_tune_new_kernels(use_cl_tuner);
- arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
- }
-}
-void Graph::run()
-{
- while(true)
- {
- if(_pimpl->_graph_input->has_accessor() && !_pimpl->_graph_input->call_accessor())
+        for(auto &output_eid : node->_output_edges)
{
- return;
- }
-
- for(auto &stage : _pimpl->_pipeline)
- {
- stage._function->run();
- }
-
- if((_pimpl->_graph_output->has_accessor() && !_pimpl->_graph_output->call_accessor())
- || (!_pimpl->_graph_output->has_accessor()))
- {
- return;
- }
- }
-}
-
-//Finalize current node's configuration
-void Graph::Private::configure(GraphHints _next_hints)
-{
- ARM_COMPUTE_ERROR_ON(_current_node == nullptr);
- ARM_COMPUTE_ERROR_ON(_graph_input == nullptr);
-
- // Is it the first node of the graph ?
- if(_current_input == nullptr)
- {
- _graph_input->set_target(_current_hints.target_hint());
- _current_input = _graph_input.get();
- _previous_hints = _current_hints; // For the first node just assume the previous node was of the same type as this one
- }
-
- if(_current_node->supports_in_place())
- {
- _current_output = _current_input;
- }
-
- //Automatic output configuration ?
- if(_current_output == nullptr)
- {
- _tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo()));
- _current_output = _tensors.back().get();
- }
-
- // If either the writer or reader node needs OpenCL then use OpenCL memory:
- if((_next_hints.target_hint() == TargetHint::OPENCL || _current_hints.target_hint() == TargetHint::OPENCL))
- {
- _current_output->set_target(TargetHint::OPENCL);
- }
- else
- {
- _current_output->set_target(TargetHint::NEON);
- }
-
- // Instantiate Node
- _ctx.hints() = _current_hints;
- std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input, _current_output);
-
- // If the operation is done in-place, do not allocate or it will prevent following layers from performing the configuration
- if(!_current_node->supports_in_place())
- {
- // Allocate current input
- _current_input->allocate();
- }
-
- // Map input if needed
- if(_current_input->target() == TargetHint::OPENCL)
- {
- if(_previous_hints.target_hint() == TargetHint::NEON)
- {
- ARM_COMPUTE_ERROR_ON(_current_hints.target_hint() == TargetHint::NEON);
- _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLUnmap>(_current_input) });
- }
- if(_current_hints.target_hint() == TargetHint::NEON)
- {
- ARM_COMPUTE_ERROR_ON(_previous_hints.target_hint() == TargetHint::NEON);
- _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLMap>(_current_input, true) });
+            remove_connection(output_eid);
}
}
- _pipeline.push_back({ _current_input, _current_output, std::move(func) });
+ node = nullptr;
- _current_input = _current_output;
- _current_output = nullptr;
- std::swap(_previous_hints, _current_hints);
- std::swap(_current_hints, _next_hints);
+ return true;
}
-void Graph::add_node(std::unique_ptr<INode> node)
+EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size_t sink_idx)
{
- ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_input == nullptr, "The graph's input must be set before the first node is added");
- ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_output != nullptr, "Nothing can be added after the output tensor");
- //Trigger the creation of the current Node:
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
- GraphHints _next_hints = _pimpl->_next_hints;
- _next_hints.set_target_hint(node->override_target_hint(_pimpl->_next_hints.target_hint()));
- ARM_COMPUTE_ERROR_ON(_next_hints.target_hint() == TargetHint::DONT_CARE);
- if(_pimpl->_current_node)
+ // Check if node index is valid, if node exists and finally if the connection index is valid
+ ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || (source_idx >= _nodes[source]->num_outputs()));
+ ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || (sink_idx >= _nodes[sink]->num_inputs()));
+
+ // Get nodes
+ std::unique_ptr<INode> &source_node = _nodes[source];
+ std::unique_ptr<INode> &sink_node = _nodes[sink];
+
+ // Check for duplicate connections (Check only sink node)
+ Edge *sink_node_edge = sink_node->input_edge(sink_idx);
+ if((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && (sink_node_edge->producer_idx() == source_idx)
+ && (sink_node_edge->consumer_id() == sink) && (sink_node_edge->consumer_idx() == sink_idx))
{
- //Finalize the previous Node:
- _pimpl->configure(_pimpl->_next_hints);
+ return sink_node_edge->id();
}
- else
+
+ // Check if there is already a tensor associated with output if not create one
+ TensorID tid = source_node->output_id(source_idx);
+ if(tid == NullTensorID)
{
- // If that's the first node then use the same TargetHint before and after the node.
- _pimpl->_current_hints = _next_hints;
+ tid = create_tensor();
}
- if(_pimpl->_current_node)
+ std::unique_ptr<Tensor> &tensor = _tensors[tid];
+
+ // Create connections
+ EdgeID eid = _edges.size();
+ auto connection = arm_compute::support::cpp14::make_unique<Edge>(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get());
+ _edges.push_back(std::move(connection));
+
+ // Add connections to source and sink nodes
+ source_node->_output_edges.insert(eid);
+ sink_node->_input_edges[sink_idx] = eid;
+
+ // Set tensor output node
+ source_node->_outputs[source_idx] = tid;
+
+ // Bind tensor to the edge
+ tensor->bind_edge(eid);
+
+    // Try to propagate shapes in the sink node
+ sink_node->forward_descriptors();
+
+ return eid;
+}
+
+bool Graph::remove_connection(EdgeID eid)
+{
+ if(eid >= _edges.size())
{
- _pimpl->_nodes.push_back(std::move(_pimpl->_current_node));
+ return false;
}
- _pimpl->_current_node = std::move(node);
-}
-//Add a tensor with an Accessor (i.e either the input or output of the graph)
-void Graph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
-{
- // If it's the first Tensor added then it will be the input of the Graph.
- if(_pimpl->_graph_input == nullptr)
+ std::unique_ptr<Edge> &edge = _edges[eid];
+
+ // Remove node connections
+ if(edge != nullptr)
{
- ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
- ARM_COMPUTE_ERROR_ON(_pimpl->_current_node != nullptr);
- _pimpl->_graph_input = std::move(tensor);
+ // Get tensor bound to the edge
+ if(edge->tensor() != nullptr)
+ {
+ edge->tensor()->unbind_edge(eid);
+ }
+
+ // Remove edges from source node
+ if(edge->producer() != nullptr)
+ {
+ edge->producer()->_output_edges.erase(eid);
+ }
+
+ // Remove edges from sink node
+ if((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size()))
+ {
+ edge->consumer()->_input_edges[edge->consumer_idx()] = EmptyEdgeID;
+ }
}
- else
- {
- // Else it will be the output of the Graph
- ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
- ARM_COMPUTE_ERROR_ON(_pimpl->_current_node == nullptr);
- _pimpl->_graph_output = std::move(tensor);
- _pimpl->_current_output = _pimpl->_graph_output.get();
- // Finalize the graph by configuring the last Node of the graph:
- _pimpl->configure(_pimpl->_current_hints); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
- _pimpl->_graph_output->allocate();
- }
+ // Clear edge
+ edge = nullptr;
+
+ return true;
}
-bool Graph::opencl_is_available()
+TensorID Graph::create_tensor(TensorDescriptor desc)
{
- return arm_compute::opencl_is_available();
+ TensorID tid = _tensors.size();
+ auto tensor = support::cpp14::make_unique<Tensor>(tid, desc);
+ _tensors.push_back(std::move(tensor));
+
+ return tid;
}
-arm_compute::GPUTarget Graph::gpu_target()
+std::string Graph::name() const
{
- // Check if OpenCL is available before returning the GPU target
- if(opencl_is_available())
- {
- return arm_compute::CLScheduler::get().target();
- }
- else
- {
- return GPUTarget::MIDGARD;
- }
+ return _name;
}
-void Graph::set_temp(TensorInfo &&tmp)
+GraphID Graph::id() const
{
- ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);
- ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_current_output != nullptr, "TensorInfo for temporary tensor already set");
-
- _pimpl->_tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tmp)));
- _pimpl->_current_output = _pimpl->_tensors.back().get();
+ return _id;
}
-GraphHints &Graph::hints()
+const std::vector<NodeID> &Graph::inputs()
{
- return _pimpl->_next_hints;
+ return _tagged_nodes[NodeType::Input];
}
-Graph &arm_compute::graph::operator<<(Graph &graph, TensorInfo &&info)
+std::vector<std::unique_ptr<INode>> &Graph::nodes()
{
- graph.set_temp(std::move(info));
- return graph;
+ return _nodes;
}
-Graph &arm_compute::graph::operator<<(Graph &graph, Tensor &&tensor)
+const std::vector<std::unique_ptr<INode>> &Graph::nodes() const
{
- graph.add_tensor_object(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
- return graph;
+ return _nodes;
}
-Graph &arm_compute::graph::operator<<(Graph &graph, SubTensor &&sub_tensor)
+const std::vector<std::unique_ptr<Edge>> &Graph::edges() const
{
- graph.add_tensor_object(arm_compute::support::cpp14::make_unique<SubTensor>(std::move(sub_tensor)));
- return graph;
+ return _edges;
}
-Graph &arm_compute::graph::operator<<(Graph &graph, TargetHint target_hint)
+std::vector<std::unique_ptr<Tensor>> &Graph::tensors()
{
- graph.hints().set_target_hint(target_hint);
- return graph;
+ return _tensors;
}
-Graph &arm_compute::graph::operator<<(Graph &graph, ConvolutionMethodHint conv_method_hint)
+const std::vector<std::unique_ptr<Tensor>> &Graph::tensors() const
{
- graph.hints().set_convolution_method_hint(conv_method_hint);
- return graph;
+ return _tensors;
}
+
+const INode *Graph::node(NodeID id) const
+{
+ return (id >= _nodes.size()) ? nullptr : _nodes[id].get();
+}
+
+INode *Graph::node(NodeID id)
+{
+ return (id >= _nodes.size()) ? nullptr : _nodes[id].get();
+}
+
+const Edge *Graph::edge(EdgeID id) const
+{
+ return (id >= _edges.size()) ? nullptr : _edges[id].get();
+}
+
+Edge *Graph::edge(EdgeID id)
+{
+ return (id >= _edges.size()) ? nullptr : _edges[id].get();
+}
+
+const Tensor *Graph::tensor(TensorID id) const
+{
+ return (id >= _tensors.size()) ? nullptr : _tensors[id].get();
+}
+
+Tensor *Graph::tensor(TensorID id)
+{
+ return (id >= _tensors.size()) ? nullptr : _tensors[id].get();
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
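
The rewritten Graph above stores nodes, edges and tensors in flat vectors indexed by NodeID/EdgeID/TensorID; add_connection() creates (or reuses) the tensor at the producer output, records the edge on both endpoints and forward-propagates descriptors, while remove_node() detaches all of a node's edges first. A minimal usage sketch under stated assumptions (a default-constructible TensorDescriptor, and InputNode/OutputNode as declared in Nodes.h):

#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/nodes/Nodes.h"

using namespace arm_compute::graph;

void graph_api_sketch()
{
    Graph g(0, "example");
    NodeID in  = g.add_node<InputNode>(TensorDescriptor{});
    NodeID out = g.add_node<OutputNode>();

    // Wires producer (in, output 0) to consumer (out, input 0) and binds a tensor.
    EdgeID e = g.add_connection(in, 0, out, 0);

    // Removing a node also removes its edges and unbinds their tensors.
    g.remove_node(out);
    (void)e;
}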
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
new file mode 100644
index 0000000..4c5d30a
--- /dev/null
+++ b/src/graph/GraphBuilder.cpp
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/GraphBuilder.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/algorithms/BFS.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#define CHECK_NODEIDX_PAIR(pair, g) \
+ ARM_COMPUTE_ERROR_ON(((pair).node_id >= (g).nodes().size()) || ((g).node((pair).node_id) == nullptr) || ((pair).index >= (g).node((pair).node_id)->num_outputs()));
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace
+{
+Status set_node_params(Graph &g, NodeID nid, NodeParams &params)
+{
+ INode *node = g.node(nid);
+ ARM_COMPUTE_RETURN_ERROR_ON(!node);
+
+ node->set_common_node_parameters(params);
+
+ return Status{};
+}
+
+Status set_accessor_on_node(Graph &g, NodeID nid, bool is_output, size_t idx, ITensorAccessorUPtr accessor)
+{
+ INode *node = g.node(nid);
+ ARM_COMPUTE_RETURN_ERROR_ON(!node);
+
+ Tensor *tensor = is_output ? node->output(idx) : node->input(idx);
+ ARM_COMPUTE_RETURN_ERROR_ON(!tensor);
+
+ tensor->set_accessor(std::move(accessor));
+
+ return Status{};
+}
+
+NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+{
+ params.name = params.name.empty() ? "" : params.name + name;
+ auto nid = GraphBuilder::add_const_node(g, params, std::move(desc), std::move(accessor));
+ set_node_params(g, nid, params);
+ return nid;
+}
+
+template <typename NT, typename... Args>
+NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&... args)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+
+ NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
+ g.add_connection(input.node_id, input.index, nid, 0);
+ set_node_params(g, nid, params);
+
+ return nid;
+}
+
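+// A grouped convolution is decomposed below into: split the input, the
+// weights (and the bias, when present) into num_groups parts, run one
+// convolution per group, then depth-concatenate the per-group outputs.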
+NodeID create_grouped_convolution(Graph &g, NodeParams &params, NodeIdxPair input, NodeID weights, NodeID bias,
+ PadStrideInfo conv_info, ConvolutionMethod method, FastMathHint fast_math_hint, unsigned int num_groups)
+{
+ bool has_bias = (bias != EmptyNodeID);
+
+ // Split input
+ NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, 2);
+
+ // Split weights
+ NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, 3);
+
+ // Split bias
+ NodeID bias_split = EmptyNodeID;
+ if(has_bias)
+ {
+ // Split bias
+ bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0);
+ }
+
+ std::vector<NodeIdxPair> convolution_outputs;
+ for(unsigned int i = 0; i < num_groups; ++i)
+ {
+ NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, method, fast_math_hint);
+ g.add_connection(input_split, i, conv_nid, 0);
+ g.add_connection(weights_split, i, conv_nid, 1);
+ if(has_bias)
+ {
+ g.add_connection(bias_split, i, conv_nid, 2);
+ }
+ set_node_params(g, conv_nid, params);
+ convolution_outputs.push_back({ conv_nid, 0 });
+ }
+
+ // Depth concatenate output
+ return GraphBuilder::add_depth_concatenate_node(g, params, convolution_outputs);
+}
+} // namespace
+
+NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+{
+ auto nid = g.add_node<ConstNode>(desc);
+ set_node_params(g, nid, params);
+ set_accessor_on_node(g, nid, true, 0, std::move(accessor));
+ return nid;
+}
+
+NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+{
+ auto nid = g.add_node<InputNode>(desc);
+ set_node_params(g, nid, params);
+ set_accessor_on_node(g, nid, true, 0, std::move(accessor));
+ return nid;
+}
+
+NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+
+ NodeID nid = g.add_node<OutputNode>();
+ g.add_connection(input.node_id, input.index, nid, 0);
+ set_node_params(g, nid, params);
+ set_accessor_on_node(g, nid, false, 0, std::move(accessor));
+
+ return nid;
+}
+
+NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info)
+{
+ return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info);
+}
+
+NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
+ ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor,
+ ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+
+ bool has_beta = (beta_accessor != nullptr);
+ bool has_gamma = (gamma_accessor != nullptr);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Calculate Common Descriptor
+ TensorDescriptor common_desc = input_tensor_desc;
+ common_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+
+    // Create mean and variance nodes
+ auto mean_nid = add_const_node_with_name(g, params, "Mean", common_desc, std::move(mean_accessor));
+ auto var_nid = add_const_node_with_name(g, params, "Variance", common_desc, std::move(var_accessor));
+
+ // Create beta node
+ NodeID beta_nid = EmptyNodeID;
+ if(has_beta)
+ {
+ beta_nid = add_const_node_with_name(g, params, "Beta", common_desc, std::move(beta_accessor));
+ }
+
+ // Create gamma node
+ NodeID gamma_nid = EmptyNodeID;
+ if(has_gamma)
+ {
+ gamma_nid = add_const_node_with_name(g, params, "Gamma", common_desc, std::move(gamma_accessor));
+ }
+
+ // Create batch normalization node and add connections
+ NodeID batch_norm_nid = g.add_node<BatchNormalizationLayerNode>(epsilon);
+ g.add_connection(input.node_id, input.index, batch_norm_nid, 0);
+ g.add_connection(mean_nid, 0, batch_norm_nid, 1);
+ g.add_connection(var_nid, 0, batch_norm_nid, 2);
+ if(has_beta)
+ {
+ g.add_connection(beta_nid, 0, batch_norm_nid, 3);
+ }
+ if(has_gamma)
+ {
+ g.add_connection(gamma_nid, 0, batch_norm_nid, 4);
+ }
+ set_node_params(g, batch_norm_nid, params);
+
+ return batch_norm_nid;
+}
+
+NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
+ Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info,
+ unsigned int num_groups, ConvolutionMethod method, FastMathHint fast_math_hint,
+ ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
+ const QuantizationInfo weights_quant_info,
+ const QuantizationInfo out_quant_info)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+ ARM_COMPUTE_ERROR_ON(depth == 0);
+ ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
+
+ bool has_bias = (bias_accessor != nullptr);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Create weights node
+ TensorDescriptor w_desc = input_tensor_desc;
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+ get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups);
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::BATCHES), depth);
+ if(!weights_quant_info.empty())
+ {
+ w_desc.quant_info = weights_quant_info;
+ }
+
+ NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
+
+    // Create bias node
+ NodeID b_nid = EmptyNodeID;
+ if(has_bias)
+ {
+ TensorDescriptor b_desc = input_tensor_desc;
+ b_desc.shape = TensorShape(depth);
+ b_nid = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
+ }
+
+ if(num_groups == 1)
+ {
+ // Create convolution node and connect
+ NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, method, fast_math_hint, out_quant_info);
+ g.add_connection(input.node_id, input.index, conv_nid, 0);
+ g.add_connection(w_nid, 0, conv_nid, 1);
+ if(has_bias)
+ {
+ g.add_connection(b_nid, 0, conv_nid, 2);
+ }
+ set_node_params(g, conv_nid, params);
+
+ return conv_nid;
+ }
+ else
+ {
+ return create_grouped_convolution(g, params, input, w_nid, b_nid, conv_info, method, fast_math_hint, num_groups);
+ }
+}
+
+NodeID GraphBuilder::add_depth_concatenate_node(Graph &g, NodeParams params, std::vector<NodeIdxPair> inputs)
+{
+ ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
+
+ NodeID nid = g.add_node<DepthConcatenateLayerNode>(inputs.size());
+
+ unsigned int i = 0;
+ for(const auto &input : inputs)
+ {
+ CHECK_NODEIDX_PAIR(input, g);
+ g.add_connection(input.node_id, input.index, nid, i++);
+ }
+ set_node_params(g, nid, params);
+
+ return nid;
+}
+
+NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, PadStrideInfo conv_info,
+ DepthwiseConvolutionMethod method,
+ ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+ ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
+
+ bool has_bias = (bias_accessor != nullptr);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Create weights node
+ TensorDescriptor w_desc = input_tensor_desc;
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+ w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+ get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+ if(!quant_info.empty())
+ {
+ w_desc.quant_info = quant_info;
+ }
+
+ NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
+
+    // Create bias node
+ NodeID b_nid = EmptyNodeID;
+ if(has_bias)
+ {
+ TensorDescriptor b_desc = input_tensor_desc;
+ b_desc.shape = TensorShape(b_desc.shape.z());
+ b_nid = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
+ }
+
+ // Create convolution node and connect
+ NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, method);
+ g.add_connection(input.node_id, input.index, conv_nid, 0);
+ g.add_connection(w_nid, 0, conv_nid, 1);
+ if(has_bias)
+ {
+ g.add_connection(b_nid, 0, conv_nid, 2);
+ }
+ set_node_params(g, conv_nid, params);
+
+ return conv_nid;
+}
+
+NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
+{
+ CHECK_NODEIDX_PAIR(input0, g);
+ CHECK_NODEIDX_PAIR(input1, g);
+
+ NodeID nid = g.add_node<EltwiseLayerNode>(operation);
+
+ g.add_connection(input0.node_id, input0.index, nid, 0);
+ g.add_connection(input1.node_id, input1.index, nid, 1);
+
+ set_node_params(g, nid, params);
+
+ return nid;
+}
+
+NodeID GraphBuilder::add_flatten_node(Graph &g, NodeParams params, NodeIdxPair input)
+{
+ return create_simple_single_input_output_node<FlattenLayerNode>(g, params, input);
+}
+
+NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
+ ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+ ARM_COMPUTE_ERROR_ON(num_outputs == 0);
+
+ bool has_bias = (bias_accessor != nullptr);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Create weights node
+ TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs);
+ NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
+
+ // Create bias node
+ NodeID b_nid = EmptyNodeID;
+ if(has_bias)
+ {
+ TensorDescriptor b_desc = input_tensor_desc;
+ b_desc.shape = TensorShape(num_outputs);
+ b_nid = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
+ }
+
+ // Create fully connected node and connect
+ NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs);
+ g.add_connection(input.node_id, input.index, fc_nid, 0);
+ g.add_connection(w_nid, 0, fc_nid, 1);
+ if(has_bias)
+ {
+ g.add_connection(b_nid, 0, fc_nid, 2);
+ }
+
+ set_node_params(g, fc_nid, params);
+
+ return fc_nid;
+}
+
+NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
+{
+ return create_simple_single_input_output_node<NormalizationLayerNode>(g, params, input, norm_info);
+}
+
+NodeID GraphBuilder::add_pooling_node(Graph &g, NodeParams params, NodeIdxPair input, PoolingLayerInfo pool_info)
+{
+ return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
+}
+
+NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
+{
+ return create_simple_single_input_output_node<ReshapeLayerNode>(g, params, input, shape);
+}
+
+NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Create mul node
+ TensorDescriptor mul_desc = input_tensor_desc;
+ const size_t C = input_tensor_desc.shape[get_dimension_idx(mul_desc, DataLayoutDimension::CHANNEL)];
+ mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), 1);
+ mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), 1);
+ mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL), C);
+ NodeID mul_const_nid = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor));
+ NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 };
+
+ // Create add node
+ TensorDescriptor add_desc = mul_desc;
+ NodeID add_const_nid = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor));
+ NodeIdxPair add_const_nidxp = { add_const_nid, 0 };
+
+ // Create node and connect
+ NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::MUL);
+ NodeIdxPair mulnode_nidxp = { mul_node, 0 };
+ NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::ADD);
+
+ return add_node;
+}
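+
+// Note: a scale layer is lowered to two element-wise nodes (per-channel MUL
+// followed by ADD). A minimal call sketch, assuming accessors that load the
+// 1x1xC "Mul" and "Add" constant tensors:
+//
+//   NodeID scale_nid = GraphBuilder::add_scale_layer(g, params, input,
+//                                                    std::move(mul_accessor),
+//                                                    std::move(add_accessor));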
+
+NodeID GraphBuilder::add_softmax_node(Graph &g, NodeParams params, NodeIdxPair input, float beta)
+{
+ return create_simple_single_input_output_node<SoftmaxLayerNode>(g, params, input, beta);
+}
+
+NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
+{
+ return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index bfc6fcd..3f31114 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,45 +22,64 @@
* SOFTWARE.
*/
#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph.h"
-using namespace arm_compute::graph;
-
-GraphHints::GraphHints(TargetHint target_hint, ConvolutionMethodHint conv_method_hint)
- : _target_hint(target_hint), _convolution_method_hint(conv_method_hint)
+namespace arm_compute
{
-}
-
-void GraphHints::set_target_hint(TargetHint target_hint)
+namespace graph
{
- _target_hint = target_hint;
-}
-
-void GraphHints::set_convolution_method_hint(ConvolutionMethodHint convolution_method)
-{
- _convolution_method_hint = convolution_method;
-}
-
-TargetHint GraphHints::target_hint() const
-{
- return _target_hint;
-}
-
-ConvolutionMethodHint GraphHints::convolution_method_hint() const
-{
- return _convolution_method_hint;
-}
-
GraphContext::GraphContext()
- : _hints()
+ : _config(), _memory_managers()
{
}
-GraphHints &GraphContext::hints()
+const GraphConfig &GraphContext::config() const
{
- return _hints;
+ return _config;
}
-const GraphHints &GraphContext::hints() const
+void GraphContext::set_config(const GraphConfig &config)
{
- return _hints;
-}
\ No newline at end of file
+ _config = config;
+}
+
+bool GraphContext::insert_memory_management_ctx(MemoryManagerContext &&memory_ctx)
+{
+ Target target = memory_ctx.target;
+ if(target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers))
+ {
+ return false;
+ }
+
+ _memory_managers[target] = std::move(memory_ctx);
+ return true;
+}
+
+MemoryManagerContext *GraphContext::memory_management_ctx(Target target)
+{
+ return (_memory_managers.find(target) != std::end(_memory_managers)) ? &_memory_managers[target] : nullptr;
+}
+
+std::map<Target, MemoryManagerContext> &GraphContext::memory_managers()
+{
+ return _memory_managers;
+}
+
+void GraphContext::finalize()
+{
+ for(auto &mm_obj : _memory_managers)
+ {
+ // Finalize intra layer memory manager
+ if(mm_obj.second.intra_mm != nullptr)
+ {
+ mm_obj.second.intra_mm->finalize();
+ }
+ // Finalize cross layer memory manager
+ if(mm_obj.second.cross_mm != nullptr)
+ {
+ mm_obj.second.cross_mm->finalize();
+ }
+ }
+}
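+
+// Usage sketch (assumes a backend-created memory manager "mm"): contexts are
+// keyed by target, so inserting a second context for the same target fails.
+//
+//   MemoryManagerContext mm_ctx;
+//   mm_ctx.target   = Target::NEON;
+//   mm_ctx.intra_mm = mm;
+//   mm_ctx.cross_mm = mm;
+//   bool inserted = ctx.insert_memory_management_ctx(std::move(mm_ctx)); // true first time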
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
new file mode 100644
index 0000000..ad45845
--- /dev/null
+++ b/src/graph/GraphManager.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/GraphManager.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/PassManager.h"
+#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+#include "arm_compute/graph/detail/ExecutionHelpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+GraphManager::GraphManager()
+ : _workloads()
+{
+ detail::default_initialize_backends();
+}
+
+void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
+{
+ // Setup graph context if not done manually
+ setup_default_graph_context(ctx);
+
+ // Check if graph has been registered
+ if(_workloads.find(graph.id()) != std::end(_workloads))
+ {
+ ARM_COMPUTE_ERROR("Graph is already registered!");
+ }
+
+ // Force the target on all graph constructs
+ Target forced_target = is_target_supported(target) ? target : get_default_target();
+ force_target_to_graph(graph, forced_target);
+
+ // Configure all tensors
+ detail::configure_all_tensors(graph);
+
+ // Apply all mutating passes
+ pm.run_all(graph);
+
+ // Validate all nodes
+ detail::validate_all_nodes(graph);
+
+ // Configure all nodes
+ auto workload = detail::configure_all_nodes(graph, ctx);
+ ARM_COMPUTE_ERROR_ON_MSG(workload.tasks.empty(), "Could not configure all nodes!");
+
+ // Allocate const tensors and call accessors
+ detail::allocate_const_tensors(graph);
+ detail::call_all_const_node_accessors(graph);
+
+ if(forced_target == Target::CL)
+ {
+ // Prepare graph
+ detail::prepare_all_tasks(workload);
+ }
+
+ // Setup tensor memory (Allocate all tensors or setup transition manager)
+ if(ctx.config().use_transition_memory_manager)
+ {
+ detail::configure_transition_manager(graph, ctx, workload);
+ }
+ else
+ {
+ detail::allocate_all_tensors(graph);
+ }
+
+ // Finalize Graph context
+ ctx.finalize();
+
+ // Register graph
+ _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+
+ if(forced_target != Target::CL)
+ {
+ // Perform a first run
+ execute_graph(graph);
+
+ // Release all unused const tensors
+ detail::release_unused_tensors(graph);
+ }
+}
+
+void GraphManager::execute_graph(Graph &graph)
+{
+ // Check if graph is finalized
+ auto it = _workloads.find(graph.id());
+ ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
+
+ // Call input accessors
+ detail::call_all_input_node_accessors(it->second);
+
+ // Run graph
+ detail::call_all_tasks(it->second);
+
+ // Call output accessors
+ detail::call_all_output_node_accessors(it->second);
+}
+
+void GraphManager::invalidate_graph(Graph &graph)
+{
+ auto it = _workloads.find(graph.id());
+ ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
+
+ _workloads.erase(it);
+}
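+
+// Lifecycle sketch (assumes a Graph "g" already built via GraphBuilder):
+// finalize once, execute any number of times, invalidate to drop the workload.
+//
+//   GraphManager gm;
+//   GraphContext ctx;
+//   PassManager  pm = create_default_pass_manager(Target::NEON);
+//   gm.finalize_graph(g, ctx, pm, Target::NEON);
+//   gm.execute_graph(g);    // calls input accessors, tasks, output accessors
+//   gm.invalidate_graph(g); // releases the cached workload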
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index c753f66..cd9a46a 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,33 +23,176 @@
*/
#include "arm_compute/graph/INode.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/Edge.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Tensor.h"
-using namespace arm_compute::graph;
-
-TargetHint INode::override_target_hint(TargetHint target_hint) const
+namespace arm_compute
{
- if(target_hint == TargetHint::OPENCL && !opencl_is_available())
+namespace graph
+{
+// *INDENT-OFF*
+// clang-format off
+INode::INode()
+ : _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED }),
+ _outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED)
+{
+}
+// clang-format on
+// *INDENT-ON*
+
+Status INode::validate() const
+{
+ return Status{};
+}
+
+void INode::set_graph(Graph *g)
+{
+ ARM_COMPUTE_ERROR_ON(g == nullptr);
+ _graph = g;
+}
+
+void INode::set_id(NodeID id)
+{
+ _id = id;
+}
+
+void INode::set_common_node_parameters(NodeParams common_params)
+{
+ _common_params = std::move(common_params);
+}
+
+void INode::set_requested_target(Target target)
+{
+ _common_params.target = target;
+}
+
+void INode::set_assigned_target(Target target)
+{
+ _assigned_target = target;
+}
+
+void INode::set_output_tensor(TensorID tid, size_t idx)
+{
+ if(tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr))
{
- target_hint = TargetHint::DONT_CARE;
+ ARM_COMPUTE_ERROR_ON(_graph == nullptr);
+ Tensor *updated_tensor = _graph->tensor(tid);
+ _outputs[idx] = tid;
+
+ // Set tensor to all output edges of the node
+ for(auto &output_edge_id : _output_edges)
+ {
+ auto output_edge = _graph->edge(output_edge_id);
+ if(output_edge != nullptr)
+ {
+ // Unbind edge from current tensor
+ auto current_output_tensor = output_edge->tensor();
+ current_output_tensor->unbind_edge(output_edge->id());
+
+ // Update tensor to edge and rebind tensor
+ output_edge->update_bound_tensor(updated_tensor);
+ updated_tensor->bind_edge(output_edge->id());
+ }
+ }
}
- GraphHints hints{ target_hint };
- target_hint = node_override_hints(hints).target_hint();
- ARM_COMPUTE_ERROR_ON(target_hint == TargetHint::OPENCL && !opencl_is_available());
- return target_hint;
}
-bool INode::supports_in_place() const
+
+NodeID INode::id() const
{
- return _supports_in_place;
+ return _id;
}
-void INode::set_supports_in_place(bool value)
+
+std::string INode::name() const
{
- _supports_in_place = value;
+ return _common_params.name;
}
-GraphHints INode::node_override_hints(GraphHints hints) const
+
+const Graph *INode::graph() const
{
- TargetHint target_hint = hints.target_hint();
- hints.set_target_hint((target_hint == TargetHint::DONT_CARE) ? TargetHint::NEON : target_hint);
- return hints;
+ return _graph;
}
+
+Graph *INode::graph()
+{
+ return _graph;
+}
+
+const std::vector<TensorID> &INode::outputs() const
+{
+ return _outputs;
+}
+
+const std::vector<EdgeID> &INode::input_edges() const
+{
+ return _input_edges;
+}
+
+const std::set<EdgeID> &INode::output_edges() const
+{
+ return _output_edges;
+}
+
+TensorID INode::input_id(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
+ Edge *e = _graph->edge(_input_edges[idx]);
+ return (e != nullptr) ? e->tensor_id() : NullTensorID;
+}
+
+TensorID INode::output_id(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+ return _outputs[idx];
+}
+
+Tensor *INode::input(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(_graph == nullptr);
+ ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
+ Edge *e = _graph->edge(_input_edges[idx]);
+ return (e != nullptr) ? e->tensor() : nullptr;
+}
+
+Tensor *INode::output(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(_graph == nullptr);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+ return _graph->tensor(_outputs[idx]);
+}
+
+EdgeID INode::input_edge_id(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
+ return _input_edges[idx];
+}
+
+Edge *INode::input_edge(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(_graph == nullptr);
+ ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
+ return _graph->edge(_input_edges[idx]);
+}
+
+size_t INode::num_inputs() const
+{
+ return _input_edges.size();
+}
+
+size_t INode::num_outputs() const
+{
+ return _outputs.size();
+}
+
+Target INode::requested_target() const
+{
+ return _common_params.target;
+}
+
+Target INode::assigned_target() const
+{
+ return _assigned_target;
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/NodeContext.cpp b/src/graph/NodeContext.cpp
deleted file mode 100644
index 2aa5aa1..0000000
--- a/src/graph/NodeContext.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/NodeContext.h"
-
-using namespace arm_compute::graph;
-
-void NodeContext::set_target(TargetHint target)
-{
- _target = target;
-}
-
-void NodeContext::add_input(arm_compute::ITensor *input)
-{
- ARM_COMPUTE_ERROR_ON(input == nullptr);
- _inputs.emplace_back(input);
-}
-
-void NodeContext::add_output(arm_compute::ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON(output == nullptr);
- _outputs.emplace_back(output);
-}
-
-OperationType NodeContext::operation() const
-{
- return _operation;
-}
-
-TargetHint NodeContext::target() const
-{
- return _target;
-}
-
-arm_compute::ITensor *NodeContext::input(size_t idx) const
-{
- ARM_COMPUTE_ERROR_ON(idx >= _inputs.size());
- return _inputs[idx];
-}
-
-arm_compute::ITensor *NodeContext::output(size_t idx) const
-{
- ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
- return _outputs[idx];
-}
-
-size_t NodeContext::num_inputs() const
-{
- return _inputs.size();
-}
-
-size_t NodeContext::num_outputs() const
-{
- return _outputs.size();
-}
\ No newline at end of file
diff --git a/src/graph/OperationRegistry.cpp b/src/graph/OperationRegistry.cpp
deleted file mode 100644
index 651653f..0000000
--- a/src/graph/OperationRegistry.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-OperationRegistry::OperationRegistry()
- : _registered_ops()
-{
-}
-
-OperationRegistry &OperationRegistry::get()
-{
- static OperationRegistry instance;
- return instance;
-}
-
-IOperation *OperationRegistry::find_operation(OperationType operation, TargetHint target)
-{
- ARM_COMPUTE_ERROR_ON(!contains(operation, target));
- auto it = std::find_if(_registered_ops[operation].begin(), _registered_ops[operation].end(), [&](const std::unique_ptr<IOperation> &op)
- {
- return (op->target() == target);
- });
- ARM_COMPUTE_ERROR_ON(it == _registered_ops[operation].end());
- return (*it).get();
-}
-
-bool OperationRegistry::contains(OperationType operation, TargetHint target) const
-{
- auto it = _registered_ops.find(operation);
- if(it != _registered_ops.end())
- {
- return std::any_of(it->second.begin(), it->second.end(), [&](const std::unique_ptr<IOperation> &op)
- {
- return (op->target() == target);
- });
- }
- return false;
-}
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
new file mode 100644
index 0000000..8ed68bd
--- /dev/null
+++ b/src/graph/PassManager.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/PassManager.h"
+
+#include "arm_compute/graph/Logger.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PassManager::PassManager()
+ : _passes()
+{
+}
+
+const std::vector<std::unique_ptr<IGraphMutator>> &PassManager::passes() const
+{
+ return _passes;
+}
+
+IGraphMutator *PassManager::pass(size_t index)
+{
+ return (index >= _passes.size()) ? nullptr : _passes.at(index).get();
+}
+
+void PassManager::append(std::unique_ptr<IGraphMutator> pass)
+{
+ if(pass)
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl);
+ _passes.push_back(std::move(pass));
+ }
+}
+
+void PassManager::clear()
+{
+ _passes.clear();
+}
+
+void PassManager::run_all(Graph &g)
+{
+ for(auto &pass : _passes)
+ {
+ if(pass)
+ {
+ ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
+ pass->mutate(g);
+ }
+ }
+}
+
+void PassManager::run(Graph &g, size_t index)
+{
+ if(index >= _passes.size())
+ {
+ return;
+ }
+
+ auto &pass = _passes.at(index);
+
+ if(pass != nullptr)
+ {
+ pass->mutate(g);
+ }
+}
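+
+// Usage sketch: the default pipeline can be extended with a user-defined
+// IGraphMutator ("MyMutator" below is hypothetical).
+//
+//   PassManager pm;
+//   pm.append(support::cpp14::make_unique<NodeFusionMutator>());
+//   pm.append(support::cpp14::make_unique<MyMutator>());
+//   pm.run_all(g);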
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/SubGraph.cpp b/src/graph/SubGraph.cpp
deleted file mode 100644
index 4065e1d..0000000
--- a/src/graph/SubGraph.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/SubGraph.h"
-
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
-
-using namespace arm_compute::graph;
-
-SubGraph::SubGraph()
- : _nodes(), _input(nullptr), _output(nullptr)
-{
-}
-
-void SubGraph::add_node(std::unique_ptr<INode> node)
-{
- _nodes.push_back(std::move(node));
-}
-
-void SubGraph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
-{
- // If it's the first Tensor added then it will be the input of the Graph.
- if(_input == nullptr)
- {
- _input = std::move(tensor);
- }
- else
- {
- _output = std::move(tensor);
- }
-}
-
-std::unique_ptr<Graph> SubGraph::construct(const GraphContext &ctx, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output)
-{
- auto graph = arm_compute::support::cpp14::make_unique<Graph>();
-
- // Set hint
- graph->hints() = ctx.hints();
-
- // Configure input
- if(_input == nullptr)
- {
- _input = std::move(input);
- }
- graph->add_tensor_object(std::move(_input));
-
- // Make sure first and last nodes of the subgraph always do operations out-of-place
- _nodes.front()->set_supports_in_place(false);
- _nodes.back()->set_supports_in_place(false);
-
- // Construct nodes
- for(auto &node : _nodes)
- {
- graph->add_node(std::move(node));
- }
-
- // Configure output
- if(_output == nullptr)
- {
- _output = std::move(output);
- }
- graph->add_tensor_object(std::move(_output));
-
- return graph;
-}
-
-bool SubGraph::has_input() const
-{
- return _input != nullptr;
-}
-
-bool SubGraph::has_output() const
-{
- return _output != nullptr;
-}
-
-SubGraph &arm_compute::graph::operator<<(SubGraph &graph, Tensor &&tensor)
-{
- graph.add_tensor_object(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
- return graph;
-}
-
-SubGraph &arm_compute::graph::operator<<(SubGraph &graph, SubTensor &&sub_tensor)
-{
- graph.add_tensor_object(arm_compute::support::cpp14::make_unique<SubTensor>(std::move(sub_tensor)));
- return graph;
-}
diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp
deleted file mode 100644
index 2e640dd..0000000
--- a/src/graph/SubTensor.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/SubTensor.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLSubTensor.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/SubTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "utils/TypePrinter.h"
-
-using namespace arm_compute::graph;
-
-namespace
-{
-template <typename SubTensorType, typename ParentTensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords, bool extend_parent)
-{
- auto ptensor = dynamic_cast<ParentTensorType *>(parent);
- auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords, extend_parent);
- return std::move(subtensor);
-}
-} // namespace
-
-SubTensor::SubTensor()
- : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr), _extend_parent(false)
-{
-}
-
-SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
- : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr), _extend_parent(extend_parent)
-{
- ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr);
- _parent = parent.tensor();
- _target = parent.target();
-
- instantiate_subtensor();
-}
-
-SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target, bool extend_parent)
- : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr), _extend_parent(extend_parent)
-{
- ARM_COMPUTE_ERROR_ON(parent == nullptr);
- instantiate_subtensor();
-}
-
-bool SubTensor::call_accessor()
-{
- return true;
-}
-
-bool SubTensor::has_accessor() const
-{
- return false;
-}
-
-arm_compute::ITensor *SubTensor::set_target(TargetHint target)
-{
- ARM_COMPUTE_ERROR_ON(target != _target);
- return (target == _target) ? _subtensor.get() : nullptr;
-}
-
-arm_compute::ITensor *SubTensor::tensor()
-{
- return _subtensor.get();
-}
-
-const arm_compute::ITensor *SubTensor::tensor() const
-{
- return _subtensor.get();
-}
-
-TargetHint SubTensor::target() const
-{
- return _target;
-}
-
-void SubTensor::allocate()
-{
- // NOP for sub-tensors
-}
-
-void SubTensor::instantiate_subtensor()
-{
- switch(_target)
- {
- case TargetHint::OPENCL:
- _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords, _extend_parent);
- break;
- case TargetHint::NEON:
- _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords, _extend_parent);
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid TargetHint");
- }
-}
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 4db79e9..287e783 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,138 +23,89 @@
*/
#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "utils/TypePrinter.h"
-
-using namespace arm_compute::graph;
-
-namespace
+namespace arm_compute
{
-template <typename TensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_tensor(TensorInfo &info)
+namespace graph
{
- auto tensor = arm_compute::support::cpp14::make_unique<TensorType>();
- tensor->allocator()->init(info);
- return std::move(tensor);
-}
-
-template <typename TensorType>
-void tensor_allocate(arm_compute::ITensor &tensor)
-{
- auto itensor = dynamic_cast<TensorType *>(&tensor);
- ARM_COMPUTE_ERROR_ON_NULLPTR(itensor);
- itensor->allocator()->allocate();
-}
-} // namespace
-
-Tensor::Tensor(TensorInfo &&info)
- : _target(TargetHint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
+Tensor::Tensor(TensorID id, TensorDescriptor desc)
+ : _id(id), _desc(std::move(desc)), _handle(nullptr), _accessor(nullptr), _bound_edges()
{
}
-Tensor::Tensor(Tensor &&src) noexcept
- : _target(src._target),
- _info(std::move(src._info)),
- _accessor(std::move(src._accessor)),
- _tensor(std::move(src._tensor))
+TensorID Tensor::id() const
{
+ return _id;
}
-void Tensor::set_info(TensorInfo &&info)
+TensorDescriptor &Tensor::desc()
{
- _info = info;
+ return _desc;
+}
+
+const TensorDescriptor &Tensor::desc() const
+{
+ return _desc;
+}
+
+void Tensor::set_handle(std::unique_ptr<ITensorHandle> backend_tensor)
+{
+ _handle = std::move(backend_tensor);
+}
+
+ITensorHandle *Tensor::handle()
+{
+ return _handle.get();
+}
+
+void Tensor::set_accessor(std::unique_ptr<ITensorAccessor> accessor)
+{
+ _accessor = std::move(accessor);
+}
+
+ITensorAccessor *Tensor::accessor()
+{
+ return _accessor.get();
}
bool Tensor::call_accessor()
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_accessor.get());
- auto cl_tensor = dynamic_cast<arm_compute::CLTensor *>(_tensor.get());
- if(cl_tensor != nullptr && cl_tensor->buffer() == nullptr)
+ // Early exit guard
+ if(!_accessor || !_handle)
{
- cl_tensor->map();
+ return false;
}
- bool retval = _accessor->access_tensor(*_tensor);
- if(cl_tensor != nullptr)
+
+ // Map tensor
+ _handle->map(true);
+
+ // Bail out if the backend buffer is null
+ if(_handle->tensor().buffer() == nullptr)
{
- cl_tensor->unmap();
+ return false;
}
- return retval;
+
+ // Call accessor
+ _accessor->access_tensor(_handle->tensor());
+
+ // Unmap tensor
+ _handle->unmap();
+
+ return true;
}
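+
+// Accessor sketch (assumes only the ITensorAccessor interface used above, and
+// requires <cstring>): a trivial accessor that zero-fills the mapped tensor.
+//
+//   struct ZeroAccessor final : public ITensorAccessor
+//   {
+//       bool access_tensor(ITensor &tensor) override
+//       {
+//           std::memset(tensor.buffer(), 0, tensor.info()->total_size());
+//           return true;
+//       }
+//   };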
-bool Tensor::has_accessor() const
+void Tensor::bind_edge(EdgeID eid)
{
- return (_accessor != nullptr);
+ _bound_edges.insert(eid);
}
-arm_compute::ITensor *Tensor::tensor()
+void Tensor::unbind_edge(EdgeID eid)
{
- return _tensor.get();
+ _bound_edges.erase(eid);
}
-const arm_compute::ITensor *Tensor::tensor() const
+const std::set<EdgeID> Tensor::bound_edges() const
{
- return _tensor.get();
+ return _bound_edges;
}
-
-const TensorInfo &Tensor::info() const
-{
- return _info;
-}
-
-arm_compute::ITensor *Tensor::set_target(TargetHint target)
-{
- if(_tensor != nullptr)
- {
- ARM_COMPUTE_ERROR_ON(target != _target);
- }
- else
- {
- switch(target)
- {
- case TargetHint::OPENCL:
- _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
- break;
- case TargetHint::NEON:
- _tensor = initialise_tensor<arm_compute::Tensor>(_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid TargetHint");
- }
- _target = target;
- }
- return _tensor.get();
-}
-
-void Tensor::allocate()
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor.get());
- switch(_target)
- {
- case TargetHint::OPENCL:
- tensor_allocate<arm_compute::CLTensor>(*_tensor);
- break;
- case TargetHint::NEON:
- tensor_allocate<arm_compute::Tensor>(*_tensor);
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid TargetHint");
- }
-}
-
-void Tensor::allocate_and_fill_if_needed()
-{
- allocate();
- if(_accessor != nullptr)
- {
- call_accessor();
- }
-}
-
-TargetHint Tensor::target() const
-{
- return _target;
-}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
new file mode 100644
index 0000000..030fa2d
--- /dev/null
+++ b/src/graph/Utils.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/Utils.h"
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/mutators/GraphMutators.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+bool is_target_supported(Target target)
+{
+ return backends::BackendRegistry::get().contains(target) && backends::BackendRegistry::get().find_backend(target)->is_backend_supported();
+}
+
+Target get_default_target()
+{
+ if(is_target_supported(Target::NEON))
+ {
+ return Target::NEON;
+ }
+ if(is_target_supported(Target::CL))
+ {
+ return Target::CL;
+ }
+ if(is_target_supported(Target::GC))
+ {
+ return Target::GC;
+ }
+ ARM_COMPUTE_ERROR("No backend exists!");
+}
+
+void force_target_to_graph(Graph &g, Target target)
+{
+ auto &nodes = g.nodes();
+ for(auto &node : nodes)
+ {
+ if(node)
+ {
+ node->set_assigned_target(target);
+ }
+ }
+
+ auto &tensors = g.tensors();
+ for(auto &tensor : tensors)
+ {
+ if(tensor)
+ {
+ tensor->desc().target = target;
+ }
+ }
+}
+
+PassManager create_default_pass_manager(Target target)
+{
+ PassManager pm;
+
+ if(target != Target::GC)
+ {
+ pm.append(support::cpp14::make_unique<InPlaceOperationMutator>());
+ pm.append(support::cpp14::make_unique<NodeFusionMutator>());
+ pm.append(support::cpp14::make_unique<SplitLayerSubTensorMutator>());
+ pm.append(support::cpp14::make_unique<DepthConcatSubTensorMutator>());
+ }
+
+ return pm;
+}
+
+void setup_default_graph_context(GraphContext &ctx)
+{
+ for(const auto &backend : backends::BackendRegistry::get().backends())
+ {
+ backend.second->setup_backend_context(ctx);
+ }
+}
+
+size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+ return descriptor.shape[get_dimension_idx(descriptor, data_layout_dimension)];
+}
+
+size_t get_dimension_idx(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+
+ /* Return the index based on the data layout
+  * NCHW: [N C H W] -> [3 2 1 0]
+  * NHWC: [N H W C] -> [3 2 1 0]
+  */
+ switch(data_layout_dimension)
+ {
+ case DataLayoutDimension::CHANNEL:
+ return (descriptor.layout == DataLayout::NCHW) ? 2 : 0;
+ break;
+ case DataLayoutDimension::HEIGHT:
+ return (descriptor.layout == DataLayout::NCHW) ? 1 : 2;
+ break;
+ case DataLayoutDimension::WIDTH:
+ return (descriptor.layout == DataLayout::NCHW) ? 0 : 1;
+ break;
+ case DataLayoutDimension::BATCHES:
+ return 3;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout index not supported!");
+ break;
+ }
+}
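+
+// Worked example: for an NCHW descriptor with shape (W=7, H=5, C=3, N=1),
+// get_dimension_idx(desc, DataLayoutDimension::CHANNEL) returns 2, so
+// get_dimension_size(desc, DataLayoutDimension::CHANNEL) yields shape[2] == 3;
+// under NHWC the channel index would be 0 instead.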
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/Workload.cpp
similarity index 62%
copy from src/graph/CL/CLMap.cpp
copy to src/graph/Workload.cpp
index 5289ea9..d8046c3 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/Workload.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,45 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/Workload.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorHandle.h"
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_compute
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+namespace graph
+{
+void ExecutionTask::operator()()
+{
+ TaskExecutor::get().execute_function(*this);
}
-void CLMap::run()
+void execute_task(ExecutionTask &task)
{
- _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+ if(task.task)
+ {
+ task.task->run();
+ }
}
+
+void ExecutionTask::prepare()
+{
+ if(task)
+ {
+ task->prepare();
+ }
+}
+
+TaskExecutor::TaskExecutor()
+ : execute_function(execute_task)
+{
+}
+
+TaskExecutor &TaskExecutor::get()
+{
+ static TaskExecutor executor;
+ return executor;
+}
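+
+// Instrumentation sketch (assumes execute_function is an assignable
+// std::function, as the constructor above suggests): time each task by
+// wrapping the default executor.
+//
+//   TaskExecutor::get().execute_function = [](ExecutionTask &task)
+//   {
+//       const auto start = std::chrono::steady_clock::now();
+//       execute_task(task);
+//       const auto elapsed = std::chrono::steady_clock::now() - start;
+//       // report "elapsed" here
+//   };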
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp
new file mode 100644
index 0000000..2803322
--- /dev/null
+++ b/src/graph/backends/BackendRegistry.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/BackendRegistry.h"
+
+using namespace arm_compute::graph::backends;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+BackendRegistry::BackendRegistry()
+ : _registered_backends()
+{
+}
+
+BackendRegistry &BackendRegistry::get()
+{
+ static BackendRegistry instance;
+ return instance;
+}
+
+IDeviceBackend *BackendRegistry::find_backend(Target target)
+{
+ ARM_COMPUTE_ERROR_ON(!contains(target));
+ return _registered_backends[target].get();
+}
+
+bool BackendRegistry::contains(Target target) const
+{
+ auto it = _registered_backends.find(target);
+ return (it != _registered_backends.end());
+}
+
+const std::map<Target, std::unique_ptr<IDeviceBackend>> &BackendRegistry::backends() const
+{
+ return _registered_backends;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
new file mode 100644
index 0000000..bf17f80
--- /dev/null
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLDeviceBackend.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistrar.h"
+#include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
+#include "arm_compute/graph/backends/CL/CLNodeValidator.h"
+#include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
+#include "arm_compute/graph/backends/CL/CLTensorHandle.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+bool file_exists(const std::string &filename)
+{
+ std::ifstream file(filename);
+ return file.good();
+}
+} // namespace
+
+/** Register CL backend */
+static detail::BackendRegistrar<CLDeviceBackend> CLDeviceBackend_registrar(Target::CL);
+
+/** Tuner export file */
+static const std::string tuner_data_filename = "acl_tuner.csv";
+
+CLDeviceBackend::CLDeviceBackend()
+ : _tuner(), _allocator(cl::Context::getDefault())
+{
+}
+
+CLDeviceBackend::~CLDeviceBackend()
+{
+ if(_tuner.tune_new_kernels() && !_tuner.lws_table().empty())
+ {
+ _tuner.save_to_file(tuner_data_filename);
+ }
+}
+
+void CLDeviceBackend::set_kernel_tuning(bool enable_tuning)
+{
+ _tuner.set_tune_new_kernels(enable_tuning);
+}
+
+void CLDeviceBackend::initialize_backend()
+{
+ // Load tuner data if available
+ if(_tuner.lws_table().empty() && file_exists(tuner_data_filename))
+ {
+ _tuner.load_from_file(tuner_data_filename);
+ }
+
+ // Setup Scheduler
+ CLScheduler::get().default_init(&_tuner);
+
+ // Create allocator with new context
+ _allocator = CLBufferAllocator();
+}
+
+void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
+{
+ // Setup tuner
+ set_kernel_tuning(ctx.config().use_tuner);
+
+ // Setup a management backend
+ if(ctx.memory_management_ctx(Target::CL) == nullptr)
+ {
+ MemoryManagerContext mm_ctx;
+ mm_ctx.target = Target::CL;
+ mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
+ mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
+ mm_ctx.cross_group = std::make_shared<CLMemoryGroup>(mm_ctx.cross_mm);
+
+ ctx.insert_memory_management_ctx(std::move(mm_ctx));
+ }
+}
+
+bool CLDeviceBackend::is_backend_supported()
+{
+ return arm_compute::opencl_is_available();
+}
+
+IAllocator *CLDeviceBackend::backend_allocator()
+{
+ return &_allocator;
+}
+
+std::unique_ptr<ITensorHandle> CLDeviceBackend::create_tensor(const Tensor &tensor)
+{
+ // Get tensor descriptor
+ const TensorDescriptor &tensor_desc = tensor.desc();
+ ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::CL);
+
+ // Create backend tensor handle
+ TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
+ info.set_data_layout(tensor_desc.layout);
+ auto backend_tensor_handle = support::cpp14::make_unique<CLTensorHandle>(info);
+
+ return std::move(backend_tensor_handle);
+}
+
+std::unique_ptr<ITensorHandle> CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+{
+ if(parent == nullptr)
+ {
+ return nullptr;
+ }
+
+ return support::cpp14::make_unique<CLSubTensorHandle>(parent, shape, coords, extend_parent);
+}
+
+std::unique_ptr<arm_compute::IFunction> CLDeviceBackend::configure_node(INode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring CL node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::CL);
+
+ // Configure node
+ return CLFunctionFactory::create(&node, ctx);
+}
+
+arm_compute::Status CLDeviceBackend::validate_node(INode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating CL node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::CL);
+
+ return CLNodeValidator::validate(&node);
+}
+
+std::shared_ptr<arm_compute::IMemoryManager> CLDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
+{
+ if(affinity == MemoryManagerAffinity::Offset)
+ {
+ ARM_COMPUTE_LOG_GRAPH_WARNING("CL Backend does not support offset affinity memory management!");
+ return nullptr;
+ }
+
+ auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+ auto pool_mgr = std::make_shared<PoolManager>();
+ auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+ mm->set_allocator(&_allocator);
+
+ return mm;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
new file mode 100644
index 0000000..db8a7a0
--- /dev/null
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -0,0 +1,607 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/graph/backends/Utils.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Returns backing tensor of a given tensor
+ *
+ * @param[in] tensor Tensor to extract the backing tensor from
+ *
+ * @return Backing tensor if present else nullptr
+ */
+arm_compute::ICLTensor *get_backing_tensor(arm_compute::graph::Tensor *tensor)
+{
+ arm_compute::ICLTensor *backing_tensor = nullptr;
+ if(tensor != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(tensor->desc().target != arm_compute::graph::Target::CL);
+ // Get backing tensor handle
+ ITensorHandle *tensor_handle = tensor->handle();
+ // Get backing tensor
+ backing_tensor = (tensor_handle != nullptr) ? polymorphic_cast<ICLTensor *>(&tensor_handle->tensor()) : nullptr;
+ }
+
+ return backing_tensor;
+}
+
+/** Create a backend activation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend activation layer function
+ */
+std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL ActivationLayerNode node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ const ActivationLayerInfo act_info = node.activation_info();
+
+ // Create function
+ auto func = support::cpp14::make_unique<CLActivationLayer>();
+ func->configure(input, output, act_info);
+
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLActivationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Activation function: " << act_info.activation()
+ << " a: " << act_info.a()
+ << " b: " << act_info.b()
+ << " InPlace : " << is_in_place_operation(input, output)
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend batch normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend batch normalization layer function
+ */
+std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating CL BatchNormalization node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 5);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *mean = get_backing_tensor(node.input(1));
+ ICLTensor *var = get_backing_tensor(node.input(2));
+ ICLTensor *beta = get_backing_tensor(node.input(3));
+ ICLTensor *gamma = get_backing_tensor(node.input(4));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ const float epsilon = node.epsilon();
+ const ActivationLayerInfo fused_act = node.fused_activation();
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLBatchNormalizationLayer>();
+ func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLBatchNormalizationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Epsilon: " << epsilon << " "
+ << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+ << " InPlace : " << is_in_place_operation(input, output)
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend convolution layer function
+ */
+std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating CL ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *weights = get_backing_tensor(node.input(1));
+ ICLTensor *biases = get_backing_tensor(node.input(2));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ biases->info()->set_data_type(DataType::S32);
+ }
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const ConvolutionMethod conv_algorithm = node.convolution_method();
+ const bool fast_math = node.fast_math_hint() == FastMathHint::ENABLED;
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::CL);
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+
+ if(conv_algorithm == ConvolutionMethod::WINOGRAD)
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<CLWinogradConvolutionLayer>(
+ std::string("CLWinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math);
+ }
+ else if(conv_algorithm == ConvolutionMethod::DIRECT)
+ {
+ std::tie(func, func_name) = create_named_function<CLDirectConvolutionLayer>(
+ std::string("CLDirectConvolutionLayer"), input, weights, biases, output, conv_info);
+ }
+ else if(conv_algorithm == ConvolutionMethod::GEMM)
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<CLGEMMConvolutionLayer>(std::string("CLGEMMConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info);
+ }
+ else
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<CLConvolutionLayer>(std::string("CLConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), fast_math);
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+ return func;
+}
+
+/** Create a backend depth concatenate layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth concatenate layer function
+ */
+std::unique_ptr<arm_compute::IFunction> create_depth_concatenate_layer(DepthConcatenateLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating CL DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Return nullptr if depth concatenate is switched off
+ if(!node.is_enabled())
+ {
+ return nullptr;
+ }
+
+ // Extract IO and info
+ std::vector<arm_compute::ICLTensor *> inputs;
+ for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ {
+ inputs.push_back(get_backing_tensor(node.input(i)));
+ }
+ ICLTensor *output = get_backing_tensor(node.output(0));
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
+ func->configure(inputs, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLDepthConcatenateLayer"
+ << " Data Type: " << output->info()->data_type()
+ << " Shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend depth-wise convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth-wise convolution layer function
+ */
+std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *weights = get_backing_tensor(node.input(1));
+ ICLTensor *biases = get_backing_tensor(node.input(2));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ biases->info()->set_data_type(DataType::S32);
+ }
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+ if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3)
+ {
+ std::tie(func, func_name) = create_named_function<CLDepthwiseConvolutionLayer3x3>(
+ std::string("CLDepthwiseConvolutionLayer3x3"), input, weights, biases, output, conv_info);
+ }
+ else
+ {
+ std::tie(func, func_name) = create_named_function<CLDepthwiseConvolutionLayer>(
+ std::string("CLDepthwiseConvolutionLayer"), input, weights, biases, output, conv_info);
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+ return func;
+}
+
+/** Create a backend element-wise operation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend element-wise operation layer function
+ */
+std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input1 = get_backing_tensor(node.input(0));
+ ICLTensor *input2 = get_backing_tensor(node.input(1));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ const EltwiseOperation eltwise_op = node.eltwise_operation();
+ const ConvertPolicy convert_policy = node.convert_policy();
+ ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+ ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ std::unique_ptr<IFunction> func = nullptr;
+ std::string func_name;
+ if(eltwise_op == EltwiseOperation::ADD)
+ {
+ std::tie(func, func_name) = create_named_function<CLArithmeticAddition>(std::string("CLArithmeticAddition"),
+ input1, input2, output,
+ convert_policy);
+ }
+ else if(eltwise_op == EltwiseOperation::SUB)
+ {
+ std::tie(func, func_name) = create_named_function<CLArithmeticSubtraction>(
+ std::string("CLArithmeticSubtraction"), input1, input2, output, convert_policy);
+ }
+ else if(eltwise_op == EltwiseOperation::MUL)
+ {
+ std::tie(func, func_name) = create_named_function<CLPixelWiseMultiplication>(
+ std::string("CLPixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy,
+ node.rounding_policy());
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input1->info()->data_type()
+ << " Shape: " << input1->info()->tensor_shape()
+ << std::endl);
+
+ return func;
+}
+
+/** Create a backend flatten layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend flatten layer function
+ */
+std::unique_ptr<IFunction> create_flatten_layer(FlattenLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL FlattenLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLFlattenLayer>();
+ func->configure(input, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLFlattenLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend fully connected layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend fully connected layer function
+ */
+std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL FullyConnectedLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *weights = get_backing_tensor(node.input(1));
+ ICLTensor *biases = get_backing_tensor(node.input(2));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(weights == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLFullyConnectedLayer>(get_memory_manager(ctx, Target::CL));
+ func->configure(input, weights, biases, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLFullyConnectedLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Biases shape: " << biases->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend normalization layer function
+ */
+std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL NormalizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ const NormalizationLayerInfo norm_info = node.normalization_info();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLNormalizationLayer>();
+ func->configure(input, output, norm_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLNormalizationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << norm_info.type()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend pooling layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend pooling layer function
+ */
+std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL PoolingLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ const PoolingLayerInfo pool_info = node.pooling_info();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLPoolingLayer>();
+ func->configure(input, output, pool_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLPoolingLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Pooling info: " << pool_info.pool_type()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend reshape layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend reshape layer function
+ */
+std::unique_ptr<IFunction> create_reshape_layer(ReshapeLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLReshapeLayer>();
+ func->configure(input, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLReshapeLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend softmax layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend softmax layer function
+ */
+std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating CL SoftmaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ICLTensor *input = get_backing_tensor(node.input(0));
+ ICLTensor *output = get_backing_tensor(node.output(0));
+ const float beta = node.beta();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CLSoftmaxLayer>(get_memory_manager(ctx, Target::CL));
+ func->configure(input, output, beta);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLSoftmaxLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+} // namespace
+
+std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
+{
+ if(node == nullptr)
+ {
+ return nullptr;
+ }
+
+ NodeType type = node->type();
+ switch(type)
+ {
+ case NodeType::ActivationLayer:
+ return create_activation_layer(*polymorphic_downcast<ActivationLayerNode *>(node));
+ case NodeType::BatchNormalizationLayer:
+ return create_batch_normalization_layer(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ case NodeType::ConvolutionLayer:
+ return create_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+ case NodeType::DepthConcatenateLayer:
+ return create_depth_concatenate_layer(*polymorphic_downcast<DepthConcatenateLayerNode *>(node));
+ case NodeType::DepthwiseConvolutionLayer:
+ return create_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::EltwiseLayer:
+ return create_eltwise_layer(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ case NodeType::FlattenLayer:
+ return create_flatten_layer(*polymorphic_downcast<FlattenLayerNode *>(node));
+ case NodeType::FullyConnectedLayer:
+ return create_fully_connected_layer(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ case NodeType::NormalizationLayer:
+ return create_normalization_layer(*polymorphic_downcast<NormalizationLayerNode *>(node));
+ case NodeType::PoolingLayer:
+ return create_pooling_layer(*polymorphic_downcast<PoolingLayerNode *>(node));
+ case NodeType::ReshapeLayer:
+ return create_reshape_layer(*polymorphic_downcast<ReshapeLayerNode *>(node));
+ case NodeType::SoftmaxLayer:
+ return create_softmax_layer(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ default:
+ return nullptr;
+ }
+}
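+
+/* Illustrative usage sketch (assumes 'node' belongs to a CL-targeted graph and
+ * 'ctx' is a GraphContext prepared by the CL device backend):
+ *
+ *   std::unique_ptr<IFunction> func = CLFunctionFactory::create(node, ctx);
+ *   if(func != nullptr)
+ *   {
+ *       func->run(); // Enqueues the backend kernels for this node
+ *   }
+ */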
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
new file mode 100644
index 0000000..c16b2e6
--- /dev/null
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLNodeValidator.h"
+
+#include "arm_compute/graph/backends/ValidateHelpers.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+Status CLNodeValidator::validate(INode *node)
+{
+ if(node == nullptr)
+ {
+ return Status{};
+ }
+
+ NodeType type = node->type();
+ switch(type)
+ {
+ case NodeType::ConvolutionLayer:
+ return detail::validate_convolution_layer<CLConvolutionLayer,
+ CLDirectConvolutionLayer,
+ CLGEMMConvolutionLayer,
+ CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+ case NodeType::DepthwiseConvolutionLayer:
+ return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
+ CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ default:
+ return Status{};
+ }
+}
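+
+/* Illustrative usage sketch (assumes validation runs before function creation,
+ * as the factory's creators expect pre-validated nodes):
+ *
+ *   const Status status = CLNodeValidator::validate(node);
+ *   if(!bool(status))
+ *   {
+ *       ARM_COMPUTE_LOG_GRAPH_INFO("Validation failed: " << status.error_description() << std::endl);
+ *   }
+ */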
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp
new file mode 100644
index 0000000..016dca7
--- /dev/null
+++ b/src/graph/backends/CL/CLSubTensorHandle.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+ : _sub_tensor(), _parent_handle(nullptr)
+{
+ ARM_COMPUTE_ERROR_ON(!parent_handle);
+ auto parent_tensor = arm_compute::utils::cast::polymorphic_downcast<ICLTensor *>(&parent_handle->tensor());
+ _sub_tensor = arm_compute::CLSubTensor(parent_tensor, shape, coords, extend_parent);
+ _parent_handle = parent_handle;
+}
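+
+// Note: the sub-tensor aliases the parent's CL buffer, so the parent handle
+// must outlive this handle.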
+
+void CLSubTensorHandle::allocate()
+{
+ // No-op: the sub-tensor aliases its parent's allocation and owns no memory
+}
+
+void CLSubTensorHandle::free()
+{
+ // No-op: the backing memory is owned and freed by the parent tensor
+}
+
+void CLSubTensorHandle::manage(IMemoryGroup *mg)
+{
+ ARM_COMPUTE_UNUSED(mg);
+ // No-op: lifetime is managed through the parent tensor
+}
+
+void CLSubTensorHandle::map(bool blocking)
+{
+ _sub_tensor.map(blocking);
+}
+
+void CLSubTensorHandle::unmap()
+{
+ _sub_tensor.unmap();
+}
+
+void CLSubTensorHandle::release_if_unused()
+{
+ // No-op: the backing memory belongs to the parent and is never released here
+}
+
+const arm_compute::ITensor &CLSubTensorHandle::tensor() const
+{
+ return _sub_tensor;
+}
+
+arm_compute::ITensor &CLSubTensorHandle::tensor()
+{
+ return _sub_tensor;
+}
+
+ITensorHandle *CLSubTensorHandle::parent_handle()
+{
+ ARM_COMPUTE_ERROR_ON(_parent_handle == nullptr);
+ return _parent_handle->parent_handle();
+}
+
+bool CLSubTensorHandle::is_subtensor() const
+{
+ return true;
+}
+
+Target CLSubTensorHandle::target() const
+{
+ return Target::CL;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
new file mode 100644
index 0000000..fdb044c
--- /dev/null
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLTensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+CLTensorHandle::CLTensorHandle(const ITensorInfo &info)
+ : _tensor()
+{
+ _tensor.allocator()->init(info);
+}
+
+void CLTensorHandle::allocate()
+{
+ _tensor.allocator()->allocate();
+}
+
+void CLTensorHandle::free()
+{
+ _tensor.allocator()->free();
+}
+
+void CLTensorHandle::manage(IMemoryGroup *mg)
+{
+ if(mg != nullptr)
+ {
+ auto *cl_mg = arm_compute::utils::cast::polymorphic_downcast<CLMemoryGroup *>(mg);
+ cl_mg->manage(&_tensor);
+ }
+}
+
+void CLTensorHandle::map(bool blocking)
+{
+ _tensor.map(blocking);
+}
+
+void CLTensorHandle::unmap()
+{
+ _tensor.unmap();
+}
+
+void CLTensorHandle::release_if_unused()
+{
+ if(!_tensor.is_used())
+ {
+ _tensor.allocator()->free();
+ }
+}
+
+const arm_compute::ITensor &CLTensorHandle::tensor() const
+{
+ return _tensor;
+}
+
+arm_compute::ITensor &CLTensorHandle::tensor()
+{
+ return _tensor;
+}
+
+ITensorHandle *CLTensorHandle::parent_handle()
+{
+ return this;
+}
+
+bool CLTensorHandle::is_subtensor() const
+{
+ return false;
+}
+
+Target CLTensorHandle::target() const
+{
+ return Target::CL;
+}
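+
+/* Illustrative host-access sketch (assumes an allocated CLTensorHandle 'handle'):
+ *
+ *   handle.allocate(); // Allocate the backing CL buffer
+ *   handle.map(true);  // Blocking map for host access
+ *   // ... read/write handle.tensor() on the host ...
+ *   handle.unmap();    // Hand the buffer back to the device
+ */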
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
new file mode 100644
index 0000000..770cca5
--- /dev/null
+++ b/src/graph/backends/GLES/GCDeviceBackend.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCDeviceBackend.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistrar.h"
+#include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
+#include "arm_compute/graph/backends/GLES/GCNodeValidator.h"
+#include "arm_compute/graph/backends/GLES/GCTensorHandle.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryGroup.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+/** Register GLES backend */
+static detail::BackendRegistrar<GCDeviceBackend> GCDeviceBackend_registrar(Target::GC);
+
+GCDeviceBackend::GCDeviceBackend()
+ : _allocator()
+{
+}
+
+void GCDeviceBackend::initialize_backend()
+{
+ // Setup Scheduler
+ GCScheduler::get().default_init();
+}
+
+void GCDeviceBackend::setup_backend_context(GraphContext &ctx)
+{
+ // Setup a management backend
+ if(ctx.memory_management_ctx(Target::GC) == nullptr)
+ {
+ MemoryManagerContext mm_ctx;
+ mm_ctx.target = Target::GC;
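+ // Buffer affinity is used because create_memory_manager() below rejects
+ // Offset affinity on the GLES backend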
+ mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
+ mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
+ mm_ctx.cross_group = std::make_shared<GCMemoryGroup>(mm_ctx.cross_mm);
+
+ ctx.insert_memory_management_ctx(std::move(mm_ctx));
+ }
+}
+
+bool GCDeviceBackend::is_backend_supported()
+{
+ return arm_compute::opengles31_is_available();
+}
+
+IAllocator *GCDeviceBackend::backend_allocator()
+{
+ return &_allocator;
+}
+
+std::unique_ptr<ITensorHandle> GCDeviceBackend::create_tensor(const Tensor &tensor)
+{
+ // Get tensor descriptor
+ const TensorDescriptor &tensor_desc = tensor.desc();
+ ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::GC);
+
+ // Create backend tensor handle
+ TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
+ info.set_data_layout(tensor_desc.layout);
+ auto backend_tensor_handle = support::cpp14::make_unique<GCTensorHandle>(info);
+
+ return std::move(backend_tensor_handle);
+}
+
+std::unique_ptr<ITensorHandle> GCDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+{
+ ARM_COMPUTE_UNUSED(parent, shape, coords, extend_parent);
+ ARM_COMPUTE_ERROR("GLES backend has no sub-tensor support!");
+ return nullptr;
+}
+
+std::unique_ptr<arm_compute::IFunction> GCDeviceBackend::configure_node(INode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring GC node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::GC);
+
+ // Configure node
+ return GCFunctionFactory::create(&node, ctx);
+}
+
+arm_compute::Status GCDeviceBackend::validate_node(INode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GC node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::GC);
+
+ return GCNodeValidator::validate(&node);
+}
+
+std::shared_ptr<arm_compute::IMemoryManager> GCDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
+{
+ if(affinity == MemoryManagerAffinity::Offset)
+ {
+ ARM_COMPUTE_LOG_GRAPH_WARNING("GC Backend does not support offset affinity memory management!");
+ return nullptr;
+ }
+
+ auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+ auto pool_mgr = std::make_shared<PoolManager>();
+ auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+ mm->set_allocator(&_allocator);
+
+ return mm;
+}
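+
+/* Illustrative usage sketch (assumes a GCDeviceBackend instance 'backend'):
+ *
+ *   auto mm = backend.create_memory_manager(MemoryManagerAffinity::Buffer);
+ *   // 'mm' would be nullptr for MemoryManagerAffinity::Offset, which this
+ *   // backend does not support
+ */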
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
new file mode 100644
index 0000000..e61e840
--- /dev/null
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/graph/backends/Utils.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Returns backing tensor of a given tensor
+ *
+ * @param[in] tensor Tensor to extract the backing tensor from
+ *
+ * @return Backing tensor if present else nullptr
+ */
+arm_compute::IGCTensor *get_backing_tensor(arm_compute::graph::Tensor *tensor)
+{
+ arm_compute::IGCTensor *backing_tensor = nullptr;
+ if(tensor != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(tensor->desc().target != arm_compute::graph::Target::GC);
+ // Get backing tensor handle
+ ITensorHandle *tensor_handle = tensor->handle();
+ // Get backing tensor
+ backing_tensor = (tensor_handle != nullptr) ? polymorphic_cast<IGCTensor *>(&tensor_handle->tensor()) : nullptr;
+ }
+
+ return backing_tensor;
+}
+
+/** Create a backend activation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend activation layer function
+ */
+std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC ActivationLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+ const ActivationLayerInfo act_info = node.activation_info();
+
+ // Create function
+ auto func = support::cpp14::make_unique<GCActivationLayer>();
+ func->configure(input, output, act_info);
+
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCActivationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Activation function: " << act_info.activation()
+ << " a: " << act_info.a()
+ << " b: " << act_info.b()
+ << " InPlace: " << is_in_place_operation(input, output)
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend batch normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend batch normalization layer function
+ */
+std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating GC BatchNormalization node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 5);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *mean = get_backing_tensor(node.input(1));
+ IGCTensor *var = get_backing_tensor(node.input(2));
+ IGCTensor *beta = get_backing_tensor(node.input(3));
+ IGCTensor *gamma = get_backing_tensor(node.input(4));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+ const float epsilon = node.epsilon();
+ const ActivationLayerInfo fused_act = node.fused_activation();
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<GCBatchNormalizationLayer>();
+ func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCBatchNormalizationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Epsilon: " << epsilon << " "
+ << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+ << " InPlace: " << is_in_place_operation(input, output)
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend convolution layer function
+ */
+std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating GC ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *weights = get_backing_tensor(node.input(1));
+ IGCTensor *biases = get_backing_tensor(node.input(2));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ biases->info()->set_data_type(DataType::S32);
+ }
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const ConvolutionMethod conv_algorithm = node.convolution_method();
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::GC);
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+
+ if(conv_algorithm == ConvolutionMethod::DIRECT)
+ {
+ std::tie(func, func_name) = create_named_function<GCDirectConvolutionLayer>(
+ std::string("GCDirectConvolutionLayer"), input, weights, biases, output, conv_info);
+ }
+ else
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<GCConvolutionLayer>(std::string("GCConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info);
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+ return func;
+}
+
+/** Create a backend depth concatenate layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth concatenate layer function
+ */
+std::unique_ptr<arm_compute::IFunction> create_depth_concatenate_layer(DepthConcatenateLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating GC DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Return nullptr if depth concatenate is switched off
+ if(!node.is_enabled())
+ {
+ return nullptr;
+ }
+
+ // Extract IO and info
+ std::vector<arm_compute::IGCTensor *> inputs;
+ for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ {
+ inputs.push_back(get_backing_tensor(node.input(i)));
+ }
+ IGCTensor *output = get_backing_tensor(node.output(0));
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<GCDepthConcatenateLayer>();
+ func->configure(inputs, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCDepthConcatenateLayer"
+ << " Data Type: " << output->info()->data_type()
+ << " Shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend depth-wise convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth-wise convolution layer function
+ */
+std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *weights = get_backing_tensor(node.input(1));
+ IGCTensor *biases = get_backing_tensor(node.input(2));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ biases->info()->set_data_type(DataType::S32);
+ }
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+ if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3)
+ {
+ std::tie(func, func_name) = create_named_function<GCDepthwiseConvolutionLayer3x3>(
+ std::string("GCDepthwiseConvolutionLayer3x3"), input, weights, biases, output, conv_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Generic DepthwiseConvolutionLayer is not supported in GLES backend");
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+ return func;
+}
+
+/** Create a backend element-wise operation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend element-wise operation layer function
+ */
+std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input1 = get_backing_tensor(node.input(0));
+ IGCTensor *input2 = get_backing_tensor(node.input(1));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+ const EltwiseOperation eltwise_op = node.eltwise_operation();
+ const ConvertPolicy convert_policy = node.convert_policy();
+ ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+ ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ std::unique_ptr<IFunction> func = nullptr;
+ std::string func_name;
+ if(eltwise_op == EltwiseOperation::ADD)
+ {
+ std::tie(func, func_name) = create_named_function<GCArithmeticAddition>(std::string("GCArithmeticAddition"),
+ input1, input2, output,
+ convert_policy);
+ }
+ else if(eltwise_op == EltwiseOperation::SUB)
+ {
+ ARM_COMPUTE_ERROR("Arithmetic subtraction is not supported in GLES backend");
+ }
+ else if(eltwise_op == EltwiseOperation::MUL)
+ {
+ std::tie(func, func_name) = create_named_function<GCPixelWiseMultiplication>(
+ std::string("GCPixelWiseMultiplication"), input1, input2, output, 1.f);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input1->info()->data_type()
+ << " Shape: " << input1->info()->tensor_shape()
+ << std::endl);
+
+ return func;
+}
+
+/** Create a backend fully connected layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend fully connected layer function
+ */
+std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC FullyConnectedLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *weights = get_backing_tensor(node.input(1));
+ IGCTensor *biases = get_backing_tensor(node.input(2));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(weights == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<GCFullyConnectedLayer>(get_memory_manager(ctx, Target::GC));
+ func->configure(input, weights, biases, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCFullyConnectedLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Biases shape: " << biases->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend normalization layer function
+ */
+std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC NormalizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+ const NormalizationLayerInfo norm_info = node.normalization_info();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<GCNormalizationLayer>();
+ func->configure(input, output, norm_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCNormalizationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << norm_info.type()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend pooling layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend pooling layer function
+ */
+std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC PoolingLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+ const PoolingLayerInfo pool_info = node.pooling_info();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<GCPoolingLayer>();
+ func->configure(input, output, pool_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCPoolingLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Pooling info: " << pool_info.pool_type()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend softmax layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend softmax layer function
+ */
+std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Creating GC SoftmaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ IGCTensor *input = get_backing_tensor(node.input(0));
+ IGCTensor *output = get_backing_tensor(node.output(0));
+ const float beta = node.beta();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<GCSoftmaxLayer>(get_memory_manager(ctx, Target::GC));
+ func->configure(input, output, beta);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCSoftmaxLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+} // namespace
+
+std::unique_ptr<IFunction> GCFunctionFactory::create(INode *node, GraphContext &ctx)
+{
+ if(node == nullptr)
+ {
+ return nullptr;
+ }
+
+ NodeType type = node->type();
+ switch(type)
+ {
+ case NodeType::ActivationLayer:
+ return create_activation_layer(*polymorphic_downcast<ActivationLayerNode *>(node));
+ case NodeType::BatchNormalizationLayer:
+ return create_batch_normalization_layer(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ case NodeType::ConvolutionLayer:
+ return create_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+ case NodeType::DepthConcatenateLayer:
+ return create_depth_concatenate_layer(*polymorphic_downcast<DepthConcatenateLayerNode *>(node));
+ case NodeType::DepthwiseConvolutionLayer:
+ return create_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::EltwiseLayer:
+ return create_eltwise_layer(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ case NodeType::FullyConnectedLayer:
+ return create_fully_connected_layer(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ case NodeType::NormalizationLayer:
+ return create_normalization_layer(*polymorphic_downcast<NormalizationLayerNode *>(node));
+ case NodeType::PoolingLayer:
+ return create_pooling_layer(*polymorphic_downcast<PoolingLayerNode *>(node));
+ case NodeType::SoftmaxLayer:
+ return create_softmax_layer(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ default:
+ return nullptr;
+ }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
new file mode 100644
index 0000000..c7f7d81
--- /dev/null
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCNodeValidator.h"
+
+#include "arm_compute/graph/backends/ValidateHelpers.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Validates a Depthwise Convolution layer node
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GCDepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
+ ARM_COMPUTE_ERROR_ON(weights == nullptr);
+
+ // Validate function
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->tensor_shape().x() != 3 || weights->tensor_shape().y() != 3, "Unsupported depthwise convolution");
+ node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::OPTIMIZED_3x3);
+
+ return Status{};
+}
+
+/** Validates a Convolution layer node
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+Status validate_convolution_layer(ConvolutionLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
+ const PadStrideInfo conv_info = node.convolution_info();
+ const ConvolutionMethod conv_algorithm = node.convolution_method();
+
+ // Validate function
+ if(conv_algorithm == ConvolutionMethod::DIRECT)
+ {
+ bool is_square = weights->tensor_shape().x() == weights->tensor_shape().y();
+ bool is_direct = (weights->tensor_shape().x() == 1) || (weights->tensor_shape().x() == 3) || (weights->tensor_shape().x() == 5);
+ bool is_correct_stride = (conv_info.stride().first <= 2) && (conv_info.stride().second <= 2);
+ if(!(is_square && is_direct && is_correct_stride))
+ {
+ node.set_convolution_method(ConvolutionMethod::DEFAULT);
+ }
+ }
+
+ return Status{};
+}
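+// In short: DIRECT is kept only for square 1x1, 3x3 or 5x5 kernels with strides
+// of at most 2; anything else falls back to ConvolutionMethod::DEFAULT, which
+// the GC function factory maps to GCConvolutionLayer.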
+} // namespace
+
+Status GCNodeValidator::validate(INode *node)
+{
+ if(node == nullptr)
+ {
+ return Status{};
+ }
+
+ NodeType type = node->type();
+ switch(type)
+ {
+ case NodeType::ConvolutionLayer:
+ return validate_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+ case NodeType::DepthwiseConvolutionLayer:
+ return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::FlattenLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation");
+ case NodeType::ReshapeLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation");
+ default:
+ return Status{};
+ }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCTensorHandle.cpp b/src/graph/backends/GLES/GCTensorHandle.cpp
new file mode 100644
index 0000000..6f96263
--- /dev/null
+++ b/src/graph/backends/GLES/GCTensorHandle.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCTensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryGroup.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+GCTensorHandle::GCTensorHandle(const ITensorInfo &info)
+ : _tensor()
+{
+ _tensor.allocator()->init(info);
+}
+
+void GCTensorHandle::allocate()
+{
+ _tensor.allocator()->allocate();
+}
+
+void GCTensorHandle::free()
+{
+ _tensor.allocator()->free();
+}
+
+void GCTensorHandle::manage(IMemoryGroup *mg)
+{
+ if(mg != nullptr)
+ {
+ auto *gc_mg = arm_compute::utils::cast::polymorphic_downcast<GCMemoryGroup *>(mg);
+ gc_mg->manage(&_tensor);
+ }
+}
+
+void GCTensorHandle::map(bool blocking)
+{
+ _tensor.map(blocking);
+}
+
+void GCTensorHandle::unmap()
+{
+ _tensor.unmap();
+}
+
+void GCTensorHandle::release_if_unused()
+{
+ if(!_tensor.is_used())
+ {
+ _tensor.allocator()->free();
+ }
+}
+
+const arm_compute::ITensor &GCTensorHandle::tensor() const
+{
+ return _tensor;
+}
+
+arm_compute::ITensor &GCTensorHandle::tensor()
+{
+ return _tensor;
+}
+
+ITensorHandle *GCTensorHandle::parent_handle()
+{
+ return this;
+}
+
+bool GCTensorHandle::is_subtensor() const
+{
+ return false;
+}
+
+Target GCTensorHandle::target() const
+{
+ return Target::GC;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
new file mode 100644
index 0000000..7c2db40
--- /dev/null
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NEDeviceBackend.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistrar.h"
+#include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
+#include "arm_compute/graph/backends/NEON/NENodeValidator.h"
+#include "arm_compute/graph/backends/NEON/NESubTensorHandle.h"
+#include "arm_compute/graph/backends/NEON/NETensorHandle.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Allocator.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/OffsetLifetimeManager.h"
+#include "arm_compute/runtime/PoolManager.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+/** Register NEON backend */
+static detail::BackendRegistrar<NEDeviceBackend> NEDeviceBackend_registrar(Target::NEON);
+
+NEDeviceBackend::NEDeviceBackend()
+ : _allocator()
+{
+}
+
+void NEDeviceBackend::initialize_backend()
+{
+}
+
+void NEDeviceBackend::setup_backend_context(GraphContext &ctx)
+{
+ // Set number of threads
+ if(ctx.config().num_threads >= 0)
+ {
+ Scheduler::get().set_num_threads(ctx.config().num_threads);
+ }
+
+ // Create function level memory manager
+ if(ctx.memory_management_ctx(Target::NEON) == nullptr)
+ {
+ MemoryManagerContext mm_ctx;
+ mm_ctx.target = Target::NEON;
+ mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Offset);
+ mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Offset);
+ mm_ctx.cross_group = std::make_shared<MemoryGroup>(mm_ctx.cross_mm);
+
+ ctx.insert_memory_management_ctx(std::move(mm_ctx));
+ }
+}
+
+bool NEDeviceBackend::is_backend_supported()
+{
+ return true;
+}
+
+IAllocator *NEDeviceBackend::backend_allocator()
+{
+ return &_allocator;
+}
+
+std::unique_ptr<ITensorHandle> NEDeviceBackend::create_tensor(const Tensor &tensor)
+{
+ // Get tensor descriptor
+ const TensorDescriptor &tensor_desc = tensor.desc();
+ ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::NEON);
+
+ // Create backend tensor handle
+ TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
+ info.set_data_layout(tensor_desc.layout);
+ auto backend_tensor_handle = support::cpp14::make_unique<NETensorHandle>(info);
+
+ return std::move(backend_tensor_handle);
+}
+
+std::unique_ptr<ITensorHandle> NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+{
+ if(parent == nullptr)
+ {
+ return nullptr;
+ }
+
+ return support::cpp14::make_unique<NESubTensorHandle>(parent, shape, coords, extend_parent);
+}
+
+std::unique_ptr<arm_compute::IFunction> NEDeviceBackend::configure_node(INode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring NEON node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::NEON);
+
+ // Configure node
+ return NEFunctionFactory::create(&node, ctx);
+}
+
+arm_compute::Status NEDeviceBackend::validate_node(INode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating NEON node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::NEON);
+
+ return NENodeValidator::validate(&node);
+}
+
+std::shared_ptr<arm_compute::IMemoryManager> NEDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
+{
+ std::shared_ptr<ILifetimeManager> lifetime_mgr = nullptr;
+ if(affinity == MemoryManagerAffinity::Buffer)
+ {
+ lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+ }
+ else
+ {
+ lifetime_mgr = std::make_shared<OffsetLifetimeManager>();
+ }
+ auto pool_mgr = std::make_shared<PoolManager>();
+ auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+ mm->set_allocator(&_allocator);
+
+ return mm;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
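
Editor's note: `create_memory_manager` above is the pattern the backend uses for both intra- and cross-function memory. A standalone sketch of the same wiring, assuming only the public runtime headers this file already includes:

// Sketch: building an offset-based memory manager by hand, mirroring
// NEDeviceBackend::create_memory_manager(). Allocator is the plain
// malloc-backed allocator the backend owns.
arm_compute::Allocator allocator;

auto lifetime_mgr = std::make_shared<arm_compute::OffsetLifetimeManager>();
auto pool_mgr     = std::make_shared<arm_compute::PoolManager>();
auto mm           = std::make_shared<arm_compute::MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
mm->set_allocator(&allocator); // the allocator provides the backing memory

// Choosing BlobLifetimeManager instead yields one blob per distinct
// lifetime, while OffsetLifetimeManager packs non-overlapping lifetimes
// into a single blob addressed by offsets, which is what both graph
// memory managers use here.
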
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
new file mode 100644
index 0000000..7b1c50f
--- /dev/null
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -0,0 +1,579 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/backends/Utils.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Returns backing tensor of a given tensor
+ *
+ * @param[in] tensor Tensor to extract the backing tensor from
+ *
+ * @return Backing tensor if present else nullptr
+ */
+arm_compute::ITensor *get_backing_tensor(arm_compute::graph::Tensor *tensor)
+{
+ return ((tensor == nullptr) || (tensor->handle() == nullptr)) ? nullptr : &tensor->handle()->tensor();
+}
+
+/** Create a backend activation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend activation layer function
+ */
+std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON ActivationLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *output = get_backing_tensor(node.output(0));
+ const ActivationLayerInfo act_info = node.activation_info();
+
+ // Create function
+ auto func = support::cpp14::make_unique<NEActivationLayer>();
+ func->configure(input, output, act_info);
+
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEActivationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Activation function: " << act_info.activation()
+ << " a: " << act_info.a()
+ << " b: " << act_info.b()
+ << " InPlace : " << is_in_place_operation(input, output)
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend batch normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend batch normalization layer function
+ */
+std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON BatchNormalization node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 5);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *mean = get_backing_tensor(node.input(1));
+ ITensor *var = get_backing_tensor(node.input(2));
+ ITensor *beta = get_backing_tensor(node.input(3));
+ ITensor *gamma = get_backing_tensor(node.input(4));
+ ITensor *output = get_backing_tensor(node.output(0));
+ const float epsilon = node.epsilon();
+ const ActivationLayerInfo fused_act = node.fused_activation();
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NEBatchNormalizationLayer>();
+ func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEBatchNormalizationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Epsilon: " << epsilon << " "
+ << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+ << " InPlace : " << is_in_place_operation(input, output)
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend convolution layer function
+ */
+std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *weights = get_backing_tensor(node.input(1));
+ ITensor *biases = get_backing_tensor(node.input(2));
+ ITensor *output = get_backing_tensor(node.output(0));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ biases->info()->set_data_type(DataType::S32);
+ }
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const ConvolutionMethod conv_algorithm = node.convolution_method();
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::NEON);
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+ if(conv_algorithm == ConvolutionMethod::DIRECT)
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<NEDirectConvolutionLayer>(std::string("NEDirectConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info);
+ }
+ else if(conv_algorithm == ConvolutionMethod::GEMM)
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<NEGEMMConvolutionLayer>(std::string("NEGEMMConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info);
+ }
+ else if(conv_algorithm == ConvolutionMethod::WINOGRAD)
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<NEWinogradConvolutionLayer>(std::string("NEWinogradConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info);
+ }
+ else
+ {
+ std::tie(func, func_name) = create_named_memory_managed_function<NEConvolutionLayer>(std::string("NEConvolutionLayer"), mm,
+ input, weights, biases, output, conv_info);
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+ return func;
+}
+
+/** Create a backend depth concatenate layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth concatenate layer function
+ */
+std::unique_ptr<arm_compute::IFunction> create_depth_concatenate_layer(DepthConcatenateLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Return nullptr if depth concatenate is switched off
+ if(!node.is_enabled())
+ {
+ return nullptr;
+ }
+
+ // Extract IO and info
+ std::vector<arm_compute::ITensor *> inputs;
+ for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ {
+ inputs.push_back(get_backing_tensor(node.input(i)));
+ }
+ ITensor *output = get_backing_tensor(node.output(0));
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
+ func->configure(inputs, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEDepthConcatenateLayer"
+ << " Data Type: " << output->info()->data_type()
+ << " Shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend depth-wise convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth-wise convolution layer function
+ */
+std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *weights = get_backing_tensor(node.input(1));
+ ITensor *biases = get_backing_tensor(node.input(2));
+ ITensor *output = get_backing_tensor(node.output(0));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ biases->info()->set_data_type(DataType::S32);
+ }
+
+ const PadStrideInfo conv_info = node.convolution_info();
+ const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+
+ // Create and configure function (we assume that functions have been validated before creation)
+ std::unique_ptr<IFunction> func;
+ std::string func_name;
+ if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3)
+ {
+ std::tie(func, func_name) = create_named_function<NEDepthwiseConvolutionLayer3x3>(std::string("NEDepthwiseConvolutionLayer3x3"),
+ input, weights, biases, output, conv_info);
+ }
+ else
+ {
+ std::tie(func, func_name) = create_named_function<NEDepthwiseConvolutionLayer>(std::string("NEDepthwiseConvolutionLayer"),
+ input, weights, biases, output, conv_info);
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+ return func;
+}
+
+/** Create a backend element-wise operation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend element-wise operation layer function
+ */
+std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input1 = get_backing_tensor(node.input(0));
+ ITensor *input2 = get_backing_tensor(node.input(1));
+ ITensor *output = get_backing_tensor(node.output(0));
+ const EltwiseOperation eltwise_op = node.eltwise_operation();
+ const ConvertPolicy convert_policy = node.convert_policy();
+ ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+ ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ std::unique_ptr<IFunction> func = nullptr;
+ std::string func_name;
+ if(eltwise_op == EltwiseOperation::ADD)
+ {
+ std::tie(func, func_name) = create_named_function<NEArithmeticAddition>(std::string("NEArithmeticAddition"),
+ input1, input2, output, convert_policy);
+ }
+ else if(eltwise_op == EltwiseOperation::SUB)
+ {
+ std::tie(func, func_name) = create_named_function<NEArithmeticSubtraction>(std::string("NEArithmeticSubtraction"),
+ input1, input2, output, convert_policy);
+ }
+ else if(eltwise_op == EltwiseOperation::MUL)
+ {
+ std::tie(func, func_name) = create_named_function<NEPixelWiseMultiplication>(std::string("NEPixelWiseMultiplication"),
+ input1, input2, output, 1.f,
+ convert_policy, node.rounding_policy());
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ << " Data Type: " << input1->info()->data_type()
+ << " Shape : " << input1->info()->tensor_shape()
+ << std::endl);
+
+ return func;
+}
+
+/** Create a backend flatten layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend flatten layer function
+ */
+std::unique_ptr<IFunction> create_flatten_layer(FlattenLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON FlattenLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *output = get_backing_tensor(node.output(0));
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NEFlattenLayer>();
+ func->configure(input, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEFlattenLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend fully connected layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend fully connected layer function
+ */
+std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON FullyConnectedLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *weights = get_backing_tensor(node.input(1));
+ ITensor *biases = get_backing_tensor(node.input(2));
+ ITensor *output = get_backing_tensor(node.output(0));
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(weights == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NEFullyConnectedLayer>(get_memory_manager(ctx, Target::NEON));
+ func->configure(input, weights, biases, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEFullyConnectedLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend normalization layer function
+ */
+std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON NormalizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *output = get_backing_tensor(node.output(0));
+ const NormalizationLayerInfo norm_info = node.normalization_info();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NENormalizationLayer>(get_memory_manager(ctx, Target::NEON));
+ func->configure(input, output, norm_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NENormalizationLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << norm_info.type()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend pooling layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend pooling layer function
+ */
+std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON PoolingLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *output = get_backing_tensor(node.output(0));
+ const PoolingLayerInfo pool_info = node.pooling_info();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NEPoolingLayer>();
+ func->configure(input, output, pool_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEPoolingLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Pooling info: " << pool_info.pool_type()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend reshape layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend reshape layer function
+ */
+std::unique_ptr<IFunction> create_reshape_layer(ReshapeLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *output = get_backing_tensor(node.output(0));
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NEReshapeLayer>();
+ func->configure(input, output);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEReshapeLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+
+/** Create a backend softmax layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend softmax layer function
+ */
+std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphContext &ctx)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON SoftmaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ ITensor *input = get_backing_tensor(node.input(0));
+ ITensor *output = get_backing_tensor(node.output(0));
+ const float beta = node.beta();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<NESoftmaxLayer>(get_memory_manager(ctx, Target::NEON));
+ func->configure(input, output, beta);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NESoftmaxLayer"
+ << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return std::move(func);
+}
+} // namespace
+
+std::unique_ptr<IFunction> NEFunctionFactory::create(INode *node, GraphContext &ctx)
+{
+ if(node == nullptr)
+ {
+ return nullptr;
+ }
+
+ NodeType type = node->type();
+ switch(type)
+ {
+ case NodeType::ActivationLayer:
+ return create_activation_layer(*polymorphic_downcast<ActivationLayerNode *>(node));
+ case NodeType::BatchNormalizationLayer:
+ return create_batch_normalization_layer(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ case NodeType::ConvolutionLayer:
+ return create_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+ case NodeType::DepthConcatenateLayer:
+ return create_depth_concatenate_layer(*polymorphic_downcast<DepthConcatenateLayerNode *>(node));
+ case NodeType::DepthwiseConvolutionLayer:
+ return create_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::EltwiseLayer:
+ return create_eltwise_layer(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ case NodeType::FlattenLayer:
+ return create_flatten_layer(*polymorphic_downcast<FlattenLayerNode *>(node));
+ case NodeType::FullyConnectedLayer:
+ return create_fully_connected_layer(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ case NodeType::NormalizationLayer:
+ return create_normalization_layer(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+ case NodeType::PoolingLayer:
+ return create_pooling_layer(*polymorphic_downcast<PoolingLayerNode *>(node));
+ case NodeType::ReshapeLayer:
+ return create_reshape_layer(*polymorphic_downcast<ReshapeLayerNode *>(node));
+ case NodeType::SoftmaxLayer:
+ return create_softmax_layer(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ default:
+ return nullptr;
+ }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
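
Editor's note: the factory is the single entry point the backend exposes for turning graph nodes into runnable functions, as `NEDeviceBackend::configure_node` shows earlier in this patch. A hedged sketch of the call-site pattern; `node` and `ctx` are assumed to come from a graph being finalized:

// Sketch: node -> function -> execution. Each ExecutionTask created
// during graph finalization ultimately wraps the IFunction returned here.
std::unique_ptr<arm_compute::IFunction> func =
    arm_compute::graph::backends::NEFunctionFactory::create(&node, ctx);
if(func != nullptr) // e.g. a disabled DepthConcatenateLayer returns nullptr
{
    func->run();
}
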
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
new file mode 100644
index 0000000..e438e79
--- /dev/null
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NENodeValidator.h"
+
+#include "arm_compute/graph/backends/ValidateHelpers.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+Status NENodeValidator::validate(INode *node)
+{
+ if(node == nullptr)
+ {
+ return Status{};
+ }
+
+ NodeType type = node->type();
+ switch(type)
+ {
+ case NodeType::ConvolutionLayer:
+ return detail::validate_convolution_layer<NEConvolutionLayer,
+ NEDirectConvolutionLayer,
+ NEGEMMConvolutionLayer,
+ NEWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+ case NodeType::DepthwiseConvolutionLayer:
+ return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
+ NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+
+ default:
+ return Status{};
+ }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp
new file mode 100644
index 0000000..c0acedd
--- /dev/null
+++ b/src/graph/backends/NEON/NESubTensorHandle.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NESubTensorHandle.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+ : _sub_tensor(), _parent_handle(nullptr)
+{
+ ARM_COMPUTE_ERROR_ON(!parent_handle);
+ _sub_tensor = arm_compute::SubTensor(&parent_handle->tensor(), shape, coords, extend_parent);
+ _parent_handle = parent_handle;
+}
+
+void NESubTensorHandle::allocate()
+{
+ // noop
+}
+
+void NESubTensorHandle::free()
+{
+ // noop
+}
+
+void NESubTensorHandle::manage(IMemoryGroup *mg)
+{
+ ARM_COMPUTE_UNUSED(mg);
+ // noop
+}
+
+void NESubTensorHandle::map(bool blocking)
+{
+ ARM_COMPUTE_UNUSED(blocking);
+}
+
+void NESubTensorHandle::unmap()
+{
+ // noop
+}
+
+void NESubTensorHandle::release_if_unused()
+{
+ // noop
+}
+
+const arm_compute::ITensor &NESubTensorHandle::tensor() const
+{
+ return _sub_tensor;
+}
+
+arm_compute::ITensor &NESubTensorHandle::tensor()
+{
+ return _sub_tensor;
+}
+
+ITensorHandle *NESubTensorHandle::parent_handle()
+{
+ ARM_COMPUTE_ERROR_ON(_parent_handle == nullptr);
+ return _parent_handle->parent_handle();
+}
+
+bool NESubTensorHandle::is_subtensor() const
+{
+ return true;
+}
+
+Target NESubTensorHandle::target() const
+{
+ return Target::NEON;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
new file mode 100644
index 0000000..caa2c10
--- /dev/null
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NETensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+NETensorHandle::NETensorHandle(const ITensorInfo &info)
+ : _tensor()
+{
+ _tensor.allocator()->init(info);
+}
+
+void NETensorHandle::allocate()
+{
+ _tensor.allocator()->allocate();
+}
+
+void NETensorHandle::free()
+{
+ _tensor.allocator()->free();
+}
+
+void NETensorHandle::manage(IMemoryGroup *mg)
+{
+ if(mg != nullptr)
+ {
+ auto *ne_mg = arm_compute::utils::cast::polymorphic_downcast<MemoryGroup *>(mg);
+ ne_mg->manage(&_tensor);
+ }
+}
+
+void NETensorHandle::map(bool blocking)
+{
+ ARM_COMPUTE_UNUSED(blocking);
+}
+
+void NETensorHandle::unmap()
+{
+}
+
+void NETensorHandle::release_if_unused()
+{
+ if(!_tensor.is_used())
+ {
+ _tensor.allocator()->free();
+ }
+}
+
+const arm_compute::ITensor &NETensorHandle::tensor() const
+{
+ return _tensor;
+}
+
+arm_compute::ITensor &NETensorHandle::tensor()
+{
+ return _tensor;
+}
+
+ITensorHandle *NETensorHandle::parent_handle()
+{
+ return this;
+}
+
+bool NETensorHandle::is_subtensor() const
+{
+ return false;
+}
+
+Target NETensorHandle::target() const
+{
+ return Target::NEON;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
new file mode 100644
index 0000000..6b2f68c
--- /dev/null
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/GraphManager.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+#include <algorithm>
+#include <map>
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+namespace
+{
+using HandleCountPair = std::pair<ITensorHandle *, unsigned int>;
+using HandleCounter = std::map<HandleCountPair::first_type, HandleCountPair::second_type>;
+using TargetHandleCounter = std::map<Target, HandleCounter>;
+
+/** Holds managed IO tensor handles of a task */
+struct TaskHandles
+{
+ std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> input_handles = {}; /**< Input handles to a task */
+ std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> output_handles = {}; /**< Output handles of a task */
+};
+
+/** Returns memory group depending on handle backend type
+ *
+ * @param[in] ctx Graph context
+ * @param[in] handle Tensor handle
+ *
+ * @return Memory group
+ */
+IMemoryGroup *get_memory_group_from_handle(GraphContext &ctx, ITensorHandle *handle)
+{
+ ARM_COMPUTE_ERROR_ON(handle == nullptr);
+ return ctx.memory_management_ctx(handle->target())->cross_group.get();
+}
+
+/** Get handles of const tensors of graph
+ *
+ * @param[in] g Graph
+ *
+ * @return Handles of const tensors of graph
+ */
+std::set<ITensorHandle *> get_const_handles(const Graph &g)
+{
+ std::set<NodeType> const_node_types = { NodeType::Input, NodeType::Output, NodeType::Const };
+
+ std::set<ITensorHandle *> const_tensors;
+
+ auto &nodes = g.nodes();
+ for(auto &node : nodes)
+ {
+ // If it's a const node:
+ if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
+ {
+ // Add all its inputs / outputs to the list of constant handles
+ for(unsigned int i = 0; i < node->num_inputs(); ++i)
+ {
+ if(node->input(i) != nullptr)
+ {
+ const_tensors.insert(node->input(i)->handle()->parent_handle());
+ }
+ }
+ for(unsigned int i = 0; i < node->num_outputs(); ++i)
+ {
+ if(node->output(i) != nullptr)
+ {
+ const_tensors.insert(node->output(i)->handle()->parent_handle());
+ }
+ }
+ }
+ }
+
+ return const_tensors;
+}
+
+/** Builds a list of all the transition handles (Handles that are used to link two nodes)
+ *
+ * @param[in] ctx Graph context
+ * @param[in] task Workload task
+ * @param[in] const_tensors Constant tensors
+ *
+ * @return List of transition handles
+ */
+TaskHandles get_transition_handles(GraphContext &ctx,
+ ExecutionTask &task,
+ const std::set<ITensorHandle *> &const_tensors)
+{
+ ARM_COMPUTE_ERROR_ON(task.node == nullptr || task.task == nullptr);
+ INode &node = *task.node;
+
+ TaskHandles transition_handles;
+
+ // Add input handles
+ for(unsigned int i = 0; i < node.input_edges().size(); ++i)
+ {
+ Edge *input_edge = node.input_edge(i);
+ // If this input is the output of another node
+ if(input_edge != nullptr && input_edge->tensor() != nullptr && const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors))
+ {
+ // Then add it to the list of transition buffers
+ ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle();
+ IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
+ transition_handles.input_handles.push_back(std::make_pair(tensor_handle, mm_group));
+ }
+ }
+
+ // Add output handles
+ for(unsigned int i = 0; i < node.num_outputs(); ++i)
+ {
+ Tensor *output_tensor = node.output(i);
+ // If this output is used as an input for another node
+ if(output_tensor != nullptr && const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors))
+ {
+ ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle();
+ IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
+ transition_handles.output_handles.push_back(std::make_pair(tensor_handle, mm_group));
+ }
+ }
+
+ return transition_handles;
+}
+
+/** Counts the reference count of each input handle, per target
+ *
+ * @param[in] task_handles Managed handles of a task
+ * @param[in,out] handle_counter Data structure that keeps the handles reference count
+ */
+void count_input_handles_per_target(const TaskHandles &task_handles, TargetHandleCounter &handle_counter)
+{
+ for(const auto &handle : task_handles.input_handles)
+ {
+ ITensorHandle *key = handle.first;
+ HandleCounter &target_counter = handle_counter[key->target()];
+ if(target_counter.find(key) == std::end(target_counter))
+ {
+ target_counter.emplace(std::make_pair(key, 1));
+ }
+ else
+ {
+ ++target_counter[key];
+ }
+ }
+}
+
+/** Calculates the lifetime of each tensor handle
+ *
+ * @param[in, out] tasks_handles Tensor handles for each task
+ * @param[in] hc Data structure that keeps the handles reference count
+ */
+void configure_handle_lifetime(std::vector<TaskHandles> &tasks_handles, const HandleCounter &hc)
+{
+ // Identify max number of tensors in flight
+ HandleCounter tensors_in_flight;
+
+ // Acquires the given handles and sets them as in flight if they aren't already
+ auto acquire = [&](std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> &handles)
+ {
+ for(auto &handle : handles)
+ {
+ ITensorHandle *parent_handle = handle.first;
+ ARM_COMPUTE_ERROR_ON(parent_handle == nullptr);
+ // If the tensor is not already in flight:
+ if(tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight))
+ {
+ ARM_COMPUTE_ERROR_ON(hc.find(parent_handle) == std::end(hc));
+ // Then add it to the list of in flight tensors
+ tensors_in_flight.insert(std::make_pair(parent_handle, hc.at(parent_handle)));
+ // Start of allocation's lifetime
+ parent_handle->manage(handle.second);
+ }
+ }
+ };
+
+ for(auto &task_handle : tasks_handles)
+ {
+ // Marking all the input and output tensors of the task as in flight
+ acquire(task_handle.input_handles);
+ acquire(task_handle.output_handles);
+
+ // Releasing the input tensors
+ for(auto &input_handle : task_handle.input_handles)
+ {
+ ITensorHandle *ihandle = input_handle.first;
+ ARM_COMPUTE_ERROR_ON(ihandle == nullptr);
+ ARM_COMPUTE_ERROR_ON(tensors_in_flight.find(ihandle) == std::end(tensors_in_flight));
+ --tensors_in_flight[ihandle];
+ if(tensors_in_flight[ihandle] <= 0)
+ {
+ // Remove tensor from tensors in flight
+ tensors_in_flight.erase(ihandle);
+ // End of allocation's lifetime
+ ihandle->allocate();
+ }
+ }
+ }
+}
+} // namespace
+
+void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload &workload)
+{
+ // Get const tensors (un-managed)
+ std::set<ITensorHandle *> const_tensors = get_const_handles(g);
+
+ std::vector<TaskHandles> tasks_handles;
+ TargetHandleCounter target_handle_count;
+
+ // Collect transition handles per task and count input handle uses
+ for(auto &task : workload.tasks)
+ {
+ // Populates IO handles
+ tasks_handles.push_back(get_transition_handles(ctx, task, const_tensors));
+
+ // Count handles
+ count_input_handles_per_target(tasks_handles.back(), target_handle_count);
+ }
+
+ // Setup memory managers
+ for(auto &hc : target_handle_count)
+ {
+ MemoryManagerContext *mm_ctx = ctx.memory_management_ctx(hc.first);
+ if(mm_ctx != nullptr)
+ {
+ if(mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr)
+ {
+ // Manage and allocate tensors
+ configure_handle_lifetime(tasks_handles, hc.second);
+ }
+ }
+ }
+}
+} // namespace detail
+} // namespace graph
+} // namespace arm_compute
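
Editor's note: the lifetime bookkeeping in `configure_handle_lifetime` is easiest to follow on a two-task chain. The trace below is explanatory only; the handle and group names are hypothetical:

// Sketch: lifetime of one transition handle T between task A (producer)
// and task B (its only consumer), i.e. refcount(T) == 1 in the counter.
//
// Task A: acquire(output_handles) -> T not yet in flight
//                                 -> T->manage(cross_group) // lifetime starts
// Task B: acquire(input_handles)  -> T already in flight, nothing to do
//         release(input_handles)  -> --refcount(T) reaches 0
//                                 -> T->allocate()          // lifetime ends
//
// Once every lifetime interval is registered this way, the offset lifetime
// manager can overlap buffers whose intervals never intersect in one blob.
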
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
new file mode 100644
index 0000000..c370fdf
--- /dev/null
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/detail/ExecutionHelpers.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/GraphManager.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+void default_initialize_backends()
+{
+ for(const auto &backend : backends::BackendRegistry::get().backends())
+ {
+ backend.second->initialize_backend();
+ }
+}
+
+void validate_all_nodes(Graph &g)
+{
+ auto &nodes = g.nodes();
+
+ // Create tasks
+ for(auto &node : nodes)
+ {
+ if(node != nullptr)
+ {
+ Target assigned_target = node->assigned_target();
+ auto backend = backends::BackendRegistry::get().find_backend(assigned_target);
+ ARM_COMPUTE_ERROR_ON_MSG(!backend, "Requested backend doesn't exist!");
+ Status status = backend->validate_node(*node);
+ ARM_COMPUTE_ERROR_ON_MSG(!bool(status), status.error_description().c_str());
+ }
+ }
+}
+
+void configure_all_tensors(Graph &g)
+{
+ auto &tensors = g.tensors();
+
+ for(auto &tensor : tensors)
+ {
+ if(tensor)
+ {
+ Target target = tensor->desc().target;
+ auto backend = backends::BackendRegistry::get().find_backend(target);
+ ARM_COMPUTE_ERROR_ON_MSG(!backend, "Requested backend doesn't exist!");
+ auto handle = backend->create_tensor(*tensor);
+ ARM_COMPUTE_ERROR_ON_MSG(!handle, "Couldn't create backend handle!");
+ tensor->set_handle(std::move(handle));
+ }
+ }
+}
+
+void allocate_all_input_tensors(INode &node)
+{
+ for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ {
+ Tensor *tensor = node.input(i);
+ if(tensor != nullptr && !tensor->bound_edges().empty())
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+ tensor->handle()->allocate();
+ }
+ }
+}
+
+void allocate_all_output_tensors(INode &node)
+{
+ for(unsigned int i = 0; i < node.num_outputs(); ++i)
+ {
+ Tensor *tensor = node.output(i);
+ if(tensor != nullptr && !tensor->bound_edges().empty())
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+ tensor->handle()->allocate();
+ }
+ }
+}
+
+void allocate_const_tensors(Graph &g)
+{
+ for(auto &node : g.nodes())
+ {
+ if(node != nullptr)
+ {
+ switch(node->type())
+ {
+ case NodeType::Const:
+ case NodeType::Input:
+ allocate_all_output_tensors(*node);
+ break;
+ case NodeType::Output:
+ allocate_all_input_tensors(*node);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+}
+
+void allocate_all_tensors(Graph &g)
+{
+ auto &tensors = g.tensors();
+
+ for(auto &tensor : tensors)
+ {
+ if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
+ {
+ tensor->handle()->allocate();
+ }
+ }
+}
+
+ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx)
+{
+ ExecutionWorkload workload;
+ workload.graph = &g;
+ workload.ctx = &ctx;
+
+ auto &nodes = g.nodes();
+
+ // Create tasks
+ for(auto &node : nodes)
+ {
+ if(node != nullptr)
+ {
+ Target assigned_target = node->assigned_target();
+ auto backend = backends::BackendRegistry::get().find_backend(assigned_target);
+ ARM_COMPUTE_ERROR_ON_MSG(!backend, "Requested backend doesn't exist!");
+ auto func = backend->configure_node(*node, ctx);
+ if(func != nullptr)
+ {
+ ExecutionTask task;
+ task.task = std::move(func);
+ task.node = node.get();
+ workload.tasks.push_back(std::move(task));
+ }
+ }
+ }
+
+ // Add inputs and outputs
+ for(auto &node : nodes)
+ {
+ if(node != nullptr && node->type() == NodeType::Input)
+ {
+ workload.inputs.push_back(node->output(0));
+ }
+
+ if(node != nullptr && node->type() == NodeType::Output)
+ {
+ workload.outputs.push_back(node->input(0));
+ continue;
+ }
+ }
+
+ return workload;
+}
+
+void release_unused_tensors(Graph &g)
+{
+ for(auto &tensor : g.tensors())
+ {
+ if(tensor != nullptr && tensor->handle() != nullptr)
+ {
+ tensor->handle()->release_if_unused();
+ }
+ }
+}
+
+void call_tensor_accessor(Tensor *tensor)
+{
+ ARM_COMPUTE_ERROR_ON(!tensor);
+ tensor->call_accessor();
+}
+
+void call_all_const_node_accessors(Graph &g)
+{
+ auto &nodes = g.nodes();
+
+ for(auto &node : nodes)
+ {
+ if(node != nullptr && node->type() == NodeType::Const)
+ {
+ call_tensor_accessor(node->output(0));
+ }
+ }
+}
+
+void call_all_input_node_accessors(ExecutionWorkload &workload)
+{
+ for(auto &input : workload.inputs)
+ {
+ if(input != nullptr)
+ {
+ input->call_accessor();
+ }
+ }
+}
+
+void prepare_all_tasks(ExecutionWorkload &workload)
+{
+ ARM_COMPUTE_ERROR_ON(workload.graph == nullptr);
+ for(auto &task : workload.tasks)
+ {
+ task.prepare();
+ release_unused_tensors(*workload.graph);
+ }
+}
+
+void call_all_tasks(ExecutionWorkload &workload)
+{
+ ARM_COMPUTE_ERROR_ON(workload.ctx == nullptr);
+
+ // Acquire memory for the transition buffers
+ for(auto &mm_ctx : workload.ctx->memory_managers())
+ {
+ if(mm_ctx.second.cross_group != nullptr)
+ {
+ mm_ctx.second.cross_group->acquire();
+ }
+ }
+
+ // Execute tasks
+ for(auto &task : workload.tasks)
+ {
+ task();
+ }
+
+ // Release memory for the transition buffers
+ for(auto &mm_ctx : workload.ctx->memory_managers())
+ {
+ if(mm_ctx.second.cross_group != nullptr)
+ {
+ mm_ctx.second.cross_group->release();
+ }
+ }
+}
+
+void call_all_output_node_accessors(ExecutionWorkload &workload)
+{
+ for(auto &output : workload.outputs)
+ {
+ if(output != nullptr)
+ {
+ output->call_accessor();
+ }
+ }
+}
+} // namespace detail
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
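
Editor's note: taken together these helpers define the order the graph manager drives. A hedged sketch of that sequence, with helper names as in this file; the exact position of each call inside GraphManager is an assumption, `graph` and `ctx` are assumed constructed by the caller, and error handling is omitted:

// Sketch: finalize-time steps.
using namespace arm_compute::graph;

detail::validate_all_nodes(graph);                // per-backend validation
detail::configure_all_tensors(graph);             // create backend tensor handles
ExecutionWorkload workload = detail::configure_all_nodes(graph, ctx);
detail::allocate_const_tensors(graph);            // const/input/output tensors first
detail::call_all_const_node_accessors(graph);     // e.g. load weights
detail::allocate_all_tensors(graph);              // remaining resizable tensors
detail::prepare_all_tasks(workload);              // per-task prepare + release

// Per-run steps:
detail::call_all_input_node_accessors(workload);
detail::call_all_tasks(workload);                 // acquires/releases transition memory
detail::call_all_output_node_accessors(workload);
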
diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp
new file mode 100644
index 0000000..96a166c
--- /dev/null
+++ b/src/graph/frontend/Stream.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/frontend/Stream.h"
+
+#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/frontend/ILayer.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace frontend
+{
+Stream::Stream(size_t id, std::string name)
+ : _manager(), _ctx(), _g(id, std::move(name))
+{
+}
+
+void Stream::finalize(Target target, const GraphConfig &config)
+{
+ PassManager pm = create_default_pass_manager(target);
+ _ctx.set_config(config);
+ _manager.finalize_graph(_g, _ctx, pm, target);
+}
+
+void Stream::run()
+{
+ _manager.execute_graph(_g);
+}
+
+void Stream::add_layer(ILayer &layer)
+{
+ auto nid = layer.create_layer(*this);
+ _tail_node = nid;
+}
+
+const Graph &Stream::graph() const
+{
+ return _g;
+}
+
+Graph &Stream::graph()
+{
+ return _g;
+}
+} // namespace frontend
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
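
Editor's note: `Stream` is the user-facing entry point of the new frontend. A minimal usage sketch; the layer classes and `operator<<` chaining come from the frontend headers, while the `nullptr` accessors (normally weight/IO accessors as in the bundled graph examples) and the layer parameters are simplifications for illustration:

#include "arm_compute/graph/frontend/Layers.h"
#include "arm_compute/graph/frontend/Stream.h"

using namespace arm_compute;
using namespace arm_compute::graph::frontend;

int main()
{
    Stream stream(0, "minimal_example");

    stream << graph::Target::NEON
           << InputLayer(graph::TensorDescriptor(TensorShape(224U, 224U, 3U), DataType::F32), nullptr)
           << ConvolutionLayer(3U, 3U, 8U, nullptr, nullptr, PadStrideInfo(1, 1, 1, 1))
           << OutputLayer(nullptr);

    stream.finalize(graph::Target::NEON, graph::GraphConfig{});
    stream.run();
    return 0;
}
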
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/frontend/SubStream.cpp
similarity index 63%
copy from src/graph/CL/CLMap.cpp
copy to src/graph/frontend/SubStream.cpp
index 5289ea9..e8bd23a 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/frontend/SubStream.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,39 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/frontend/SubStream.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/frontend/ILayer.h"
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_compute
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+namespace graph
+{
+namespace frontend
+{
+SubStream::SubStream(IStream &s)
+ : _s(s)
+{
+ _hints = s.hints();
+ _tail_node = s.tail_node();
}
-void CLMap::run()
+void SubStream::add_layer(ILayer &layer)
{
- _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+ auto nid = layer.create_layer(*this);
+ _tail_node = nid;
}
+
+const Graph &SubStream::graph() const
+{
+ return _s.graph();
+}
+
+Graph &SubStream::graph()
+{
+ return _s.graph();
+}
+} // namespace frontend
+} // namespace graph
+} // namespace arm_compute
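
Editor's note: `SubStream` copies the parent's hints and tail node so a graph can fork at that point and be merged back later. A hedged sketch of the intended pattern, with `BranchLayer` usage modeled on the library's Inception-style examples and accessors again elided as `nullptr`:

// Sketch: fork two branches off `stream`'s current tail and merge them
// back by depth concatenation.
SubStream branch_a(stream);
branch_a << ConvolutionLayer(1U, 1U, 16U, nullptr, nullptr, PadStrideInfo(1, 1, 0, 0));

SubStream branch_b(stream);
branch_b << ConvolutionLayer(3U, 3U, 16U, nullptr, nullptr, PadStrideInfo(1, 1, 1, 1));

stream << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(branch_a), std::move(branch_b));
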
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
new file mode 100644
index 0000000..c56f4c5
--- /dev/null
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/core/utils/misc/Iterable.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+const char *DepthConcatSubTensorMutator::name()
+{
+ return "DepthConcatSubTensorMutator";
+}
+
+void DepthConcatSubTensorMutator::mutate(Graph &g)
+{
+ // Process nodes in reverse execution order
+ for(auto &node : arm_compute::utils::iterable::reverse_iterate(g.nodes()))
+ {
+ if(node && node->type() == NodeType::DepthConcatenateLayer && node->output(0) != nullptr)
+ {
+ // Get output tensor
+ auto output_tensor = node->output(0);
+
+ // Check that all input tensors are valid and share the output tensor's target
+ bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
+ [&](const EdgeID & eid)
+ {
+ return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target);
+ });
+
+ // Create sub-tensors
+ if(is_valid && backends::BackendRegistry::get().find_backend(output_tensor->desc().target) != nullptr)
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
+ << node->id() << " and name : " << node->name() << std::endl);
+ // Create sub-tensor handles
+ unsigned int depth = 0;
+ for(unsigned int i = 0; i < node->input_edges().size(); ++i)
+ {
+ auto input_tensor = node->input(i);
+ const auto input_shape = input_tensor->desc().shape;
+
+ auto backend = backends::BackendRegistry::get().find_backend(input_tensor->desc().target);
+ auto handle = backend->create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false);
+ input_tensor->set_handle(std::move(handle));
+
+ depth += input_shape.z();
+ }
+
+ auto *dc_node = arm_compute::utils::cast::polymorphic_downcast<DepthConcatenateLayerNode *>(node.get());
+ dc_node->set_enabled(false);
+ }
+ }
+ }
+}
+} // namespace graph
+} // namespace arm_compute
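Mutators are not run directly; create_default_pass_manager() registers them and GraphManager::finalize_graph() applies them during finalization. A standalone sketch, assuming PassManager::append and run_all as used by the graph manager, with a graph g built elsewhere:

// Sketch: applying the pass to a graph g outside the default pipeline.
PassManager pm;
pm.append(arm_compute::support::cpp14::make_unique<DepthConcatSubTensorMutator>());
pm.run_all(g); // calls mutate(g) on each registered pass in order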
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
new file mode 100644
index 0000000..bd3f098
--- /dev/null
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/InPlaceOperationMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+const char *InPlaceOperationMutator::name()
+{
+ return "InPlaceOperationMutator";
+}
+
+void InPlaceOperationMutator::mutate(Graph &g)
+{
+ std::set<NodeType> in_place_nodes = { NodeType::BatchNormalizationLayer, NodeType::ActivationLayer };
+
+ // Not interested in the order of nodes
+ for(auto &node : g.nodes())
+ {
+ if(node && in_place_nodes.find(node->type()) != std::end(in_place_nodes))
+ {
+ // Get input edge
+ Edge *input_edge = node->input_edge(0);
+
+ // Force in-place computation only when the parent has a single output edge (no other consumers of its tensor)
+ if((input_edge != nullptr) && (input_edge->producer() != nullptr) && (input_edge->producer()->output_edges().size() == 1))
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Switching to in-place computation for the node with ID : "
+ << node->id() << " and name : " << node->name() << std::endl);
+ // Update output
+ auto tensor = input_edge->tensor();
+ node->set_output_tensor(tensor->id(), 0);
+ }
+ }
+ }
+}
+} // namespace graph
+} // namespace arm_compute
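The IGraphMutator interface implemented here is just name() and mutate(), so applications can slot in their own passes; a hypothetical pass that only logs:

// Hypothetical pass illustrating the IGraphMutator contract.
class NodeCountLogger final : public IGraphMutator
{
public:
    const char *name() override
    {
        return "NodeCountLogger";
    }
    void mutate(Graph &g) override
    {
        ARM_COMPUTE_LOG_GRAPH_INFO("Graph contains " << g.nodes().size() << " nodes" << std::endl);
    }
};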
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
new file mode 100644
index 0000000..2e893c2
--- /dev/null
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/NodeFusionMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+void fuse_batch_norm_with_activation(Graph &g)
+{
+ // Not interested in the order of nodes
+ for(auto &node : g.nodes())
+ {
+ // Check that the node is a batch normalization node and does not branch
+ if(node && node->type() == NodeType::BatchNormalizationLayer && node->output_edges().size() == 1)
+ {
+ auto output_edge_id = *node->output_edges().begin();
+ auto output_edge = g.edge(output_edge_id);
+ // Check if the following node is an activation layer
+ if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing Batch Normalization node with ID : " << output_edge->producer_id()
+ << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
+
+ auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->producer());
+ auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
+
+ // Get driving nodes of activation node
+ std::vector<NodeIdxPair> act_driving_nodes;
+ for(auto &act_output_edge_id : act_node->output_edges())
+ {
+ auto act_output_edge = g.edge(act_output_edge_id);
+ if(act_output_edge != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(act_output_edge->consumer() == nullptr);
+ act_driving_nodes.push_back({ act_output_edge->consumer_id(), act_output_edge->consumer_idx() });
+ }
+ }
+
+ // Transfer the activation info to the batch normalization node
+ bn_node->set_fused_activation(act_node->activation_info());
+
+ // Remove activation node
+ g.remove_node(act_node->id());
+
+ // Update batch normalization node outputs
+ for(auto &driving_node : act_driving_nodes)
+ {
+ g.add_connection(bn_node->id(), 0, driving_node.node_id, driving_node.index);
+ }
+ }
+ }
+ }
+}
+} // namespace detail
+
+const char *NodeFusionMutator::name()
+{
+ return "NodeFusionMutator";
+}
+
+void NodeFusionMutator::mutate(Graph &g)
+{
+ detail::fuse_batch_norm_with_activation(g);
+}
+} // namespace graph
+} // namespace arm_compute
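The effect of the pass on a typical chain, sketched on a graph g assumed to be built elsewhere:

// Before: [BatchNormalizationLayer] -> [ActivationLayer] -> [consumers...]
// After : [BatchNormalizationLayer + fused ActivationLayerInfo] -> [consumers...]
NodeFusionMutator fuser;
fuser.mutate(g); // removes eligible activation nodes and rewires their consumers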
diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
new file mode 100644
index 0000000..2a8c029
--- /dev/null
+++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/nodes/SplitLayerNode.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/core/utils/misc/Iterable.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+const char *SplitLayerSubTensorMutator::name()
+{
+ return "SplitLayerSubTensorMutator";
+}
+
+void SplitLayerSubTensorMutator::mutate(Graph &g)
+{
+ // Process nodes in reverse execution order
+ for(auto &node : arm_compute::utils::iterable::reverse_iterate(g.nodes()))
+ {
+ if(node && node->type() == NodeType::SplitLayer && node->input(0) != nullptr)
+ {
+ // Get input tensor
+ Tensor *input_tensor = node->input(0);
+
+ // Check that all output tensors are valid and share the input tensor's target
+ bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(),
+ [&](const TensorID & tid)
+ {
+ return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target);
+ });
+
+ // Create sub-tensors
+ if(is_valid && backends::BackendRegistry::get().find_backend(input_tensor->desc().target) != nullptr)
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
+ << node->id() << " and name : " << node->name() << std::endl);
+
+ auto *split_node = arm_compute::utils::cast::polymorphic_downcast<SplitLayerNode *>(node.get());
+
+ const unsigned int axis = split_node->axis();
+ const unsigned int num_splits = split_node->num_splits();
+ const bool extend_parent = (axis < 2);
+
+ // Create sub-tensor handles
+ for(unsigned int i = 0; i < node->outputs().size(); ++i)
+ {
+ Tensor *output_tensor = node->output(i);
+ const TensorShape output_shape = output_tensor->desc().shape;
+ Coordinates coords;
+ std::tie(std::ignore, coords) = SplitLayerNode::compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
+
+ backends::IDeviceBackend *backend = backends::BackendRegistry::get().find_backend(output_tensor->desc().target);
+ std::unique_ptr<ITensorHandle> handle = backend->create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
+ output_tensor->set_handle(std::move(handle));
+ }
+ }
+ }
+ }
+}
+} // namespace graph
+} // namespace arm_compute
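A worked example of the sub-tensor placement this pass produces for a channel split (the per-output coordinates follow SplitLayerNode::compute_output_descriptor, which is not part of this hunk):

// Input 16x16x8 (WxHxC), num_splits = 2, axis = 2 (channels):
//   extend_parent = (axis < 2) = false, so sub-tensors must fit inside the parent.
//   output 0: shape 16x16x4 at Coordinates(0, 0, 0)
//   output 1: shape 16x16x4 at Coordinates(0, 0, 4)
// Each output then aliases the input's backing memory instead of owning a buffer.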
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
deleted file mode 100644
index 546c42a..0000000
--- a/src/graph/nodes/ActivationLayer.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/ActivationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
- : _activation_info(activation_info)
-{
- set_supports_in_place(true);
-}
-
-std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::ActivationLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
- node_ctx.add_parameter<ActivationLayerInfo>("ActivationLayerInfo", _activation_info);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::ActivationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
new file mode 100644
index 0000000..414684c
--- /dev/null
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ActivationLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ActivationLayerNode::ActivationLayerNode(ActivationLayerInfo info)
+ : _info(info)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+ActivationLayerInfo ActivationLayerNode::activation_info() const
+{
+ return _info;
+}
+
+bool ActivationLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor ActivationLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return src->desc();
+}
+
+NodeType ActivationLayerNode::type() const
+{
+ return NodeType::ActivationLayer;
+}
+
+void ActivationLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/BatchNormalizationLayer.cpp b/src/graph/nodes/BatchNormalizationLayer.cpp
deleted file mode 100644
index 24287ac..0000000
--- a/src/graph/nodes/BatchNormalizationLayer.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/BatchNormalizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> BatchNormalizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- unsigned int batch_norm_size = in->info()->dimension(2);
- if(_mean.tensor() == nullptr)
- {
- _mean.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
- }
- if(_var.tensor() == nullptr)
- {
- _var.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
- }
- if(_beta.tensor() == nullptr)
- {
- _beta.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
- }
- if(_gamma.tensor() == nullptr)
- {
- _gamma.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
- }
-
- bool mean_is_loaded = _mean.tensor() != nullptr;
- bool var_is_loaded = _var.tensor() != nullptr;
- bool gamma_is_loaded = _gamma.tensor() != nullptr;
- bool beta_is_loaded = _beta.tensor() != nullptr;
-
- // Set mean, var, gamma and beta target
- _mean.set_target(_target_hint);
- _var.set_target(_target_hint);
- _gamma.set_target(_target_hint);
- _beta.set_target(_target_hint);
-
- // Create node context
- NodeContext node_ctx(OperationType::BatchNormalizationLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_input(_mean.tensor());
- node_ctx.add_input(_var.tensor());
- node_ctx.add_input(_beta.tensor());
- node_ctx.add_input(_gamma.tensor());
- node_ctx.add_output(out);
- node_ctx.add_parameter<float>("epsilon", _epsilon);
- node_ctx.add_parameter<ActivationLayerInfo>("act_info", _act_info);
-
- // Configure operation
- auto func = OperationRegistry::get().find_operation(OperationType::BatchNormalizationLayer, _target_hint)->configure(node_ctx);
-
- // Fill tensors
- if(!mean_is_loaded)
- {
- _mean.allocate_and_fill_if_needed();
- }
- if(!var_is_loaded)
- {
- _var.allocate_and_fill_if_needed();
- }
- if(!gamma_is_loaded)
- {
- _gamma.allocate_and_fill_if_needed();
- }
- if(!beta_is_loaded)
- {
- _beta.allocate_and_fill_if_needed();
- }
-
- // Get function
- return func;
-}
\ No newline at end of file
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
new file mode 100644
index 0000000..3ae11fc
--- /dev/null
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BatchNormalizationLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+BatchNormalizationLayerNode::BatchNormalizationLayerNode(float epsilon, ActivationLayerInfo fused_activation)
+ : _epsilon(epsilon), _fused_activation(fused_activation)
+{
+ _input_edges.resize(5, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+float BatchNormalizationLayerNode::epsilon() const
+{
+ return _epsilon;
+}
+
+ActivationLayerInfo BatchNormalizationLayerNode::fused_activation() const
+{
+ return _fused_activation;
+}
+
+void BatchNormalizationLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+ _fused_activation = fused_activation;
+}
+
+bool BatchNormalizationLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor BatchNormalizationLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return src->desc();
+}
+
+NodeType BatchNormalizationLayerNode::type() const
+{
+ return NodeType::BatchNormalizationLayer;
+}
+
+void BatchNormalizationLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/BranchLayer.cpp b/src/graph/nodes/BranchLayer.cpp
deleted file mode 100644
index 7a20a56..0000000
--- a/src/graph/nodes/BranchLayer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/BranchLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/SubGraph.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
-
-#include <memory>
-#include <tuple>
-#include <vector>
-
-using namespace arm_compute::graph;
-
-/** Branch function */
-class BranchFunction final : public arm_compute::IFunction
-{
-public:
- /** Default Constructor */
- BranchFunction()
- : _graphs()
- {
- }
- /** Registers graph to be executed by the branch function
- *
- * @param[in] graph Graph to register
- */
- void register_graph(std::unique_ptr<Graph> graph)
- {
- _graphs.push_back(std::move(graph));
- }
- // Inherited methods overriden:
- void run() override
- {
- for(auto &g : _graphs)
- {
- ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
- g->run();
- }
- }
-
-private:
- std::vector<std::unique_ptr<Graph>> _graphs;
-};
-
-std::unique_ptr<arm_compute::IFunction> BranchLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON(_branch_merge_method != BranchMergeMethod::DEPTH_CONCATENATE);
- ARM_COMPUTE_UNUSED(_branch_merge_method);
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- // Create branch function
- auto func = arm_compute::support::cpp14::make_unique<BranchFunction>();
-
- // Track output depth
- int depth = 0;
-
- // Constuct all sub-graphs given the input/output
- for(auto &sg : _sub_graphs)
- {
- ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
-
- // IO buffers
- std::unique_ptr<ITensorObject> in;
- std::unique_ptr<ITensorObject> out;
- SubTensor *out_sub_tensor = nullptr;
-
- // Create input sub-tensor
- if(!sg->has_input())
- {
- ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
- in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
- input->tensor()->info()->tensor_shape(),
- Coordinates());
- }
-
- // Create output sub-tensor
- if(!sg->has_output())
- {
- ARM_COMPUTE_ERROR_ON((dynamic_cast<Tensor *>(output) == nullptr) && (dynamic_cast<SubTensor *>(output) == nullptr));
-
- out = arm_compute::support::cpp14::make_unique<SubTensor>(output->tensor(),
- TensorShape(),
- Coordinates(0, 0, depth),
- output->target(),
- true);
- out_sub_tensor = dynamic_cast<SubTensor *>(out.get());
- }
-
- // Construct sub_graph
- auto g = sg->construct(ctx, std::move(in), std::move(out));
-
- // Register graph to function
- func->register_graph(std::move(g));
-
- // Update and track depth
- if(out_sub_tensor != nullptr)
- {
- ARM_COMPUTE_ERROR_ON(out_sub_tensor->tensor() == nullptr);
- depth += out_sub_tensor->tensor()->info()->tensor_shape()[2];
- }
- }
-
- return std::move(func);
-}
\ No newline at end of file
diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp
new file mode 100644
index 0000000..2f3cd14
--- /dev/null
+++ b/src/graph/nodes/ConstNode.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ConstNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ConstNode::ConstNode(TensorDescriptor desc)
+ : _desc(std::move(desc))
+{
+ _outputs.resize(1, NullTensorID);
+}
+
+bool ConstNode::forward_descriptors()
+{
+ if(output_id(0) != NullTensorID)
+ {
+ Tensor *t = output(0);
+ ARM_COMPUTE_ERROR_ON(t == nullptr);
+ t->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor ConstNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ return _desc;
+}
+
+NodeType ConstNode::type() const
+{
+ return NodeType::Const;
+}
+
+void ConstNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
deleted file mode 100644
index f292b89..0000000
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/ConvolutionLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "support/ToolchainSupport.h"
-#include "utils/GraphTypePrinter.h"
-#include "utils/TypePrinter.h"
-
-#include <tuple>
-#include <vector>
-
-using namespace arm_compute::graph;
-
-namespace
-{
-/** Calculates the output shaped of the convolution layer
- *
- * @param[in] input_shape Input tensor shape
- * @param[in] weights_shape Weights shape
- * @param[in] conv_info Convolution information (padding, stride, etc.)
- *
- * @return The expected output tensor shape
- */
-TensorShape calculate_convolution_layer_output_shape(const TensorShape &input_shape, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- // Get output width and height
- std::tie(output_width, output_height) = arm_compute::scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
- // Create output shape
- TensorShape output_shape = input_shape;
- output_shape.set(0, output_width);
- output_shape.set(1, output_height);
- output_shape.set(2, weights_shape[3]);
-
- return output_shape;
-}
-
-// Instantiate GEMM based convolution layer
-template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
- const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
-{
- auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
- conv->configure(
- dynamic_cast<TensorType *>(input),
- dynamic_cast<TensorType *>(weights),
- dynamic_cast<TensorType *>(biases),
- dynamic_cast<TensorType *>(output),
- conv_info, weights_info);
- return std::move(conv);
-}
-
-// Instantiate direct convolution layer
-template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_direct_function(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
- const PadStrideInfo &conv_info)
-{
- auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
- conv->configure(
- dynamic_cast<TensorType *>(input),
- dynamic_cast<TensorType *>(weights),
- dynamic_cast<TensorType *>(biases),
- dynamic_cast<TensorType *>(output),
- conv_info);
- return std::move(conv);
-}
-
-template <TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
- const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- ConvolutionMethodHint conv_method);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info,
- ConvolutionMethodHint conv_method)
-{
- if((conv_method == ConvolutionMethodHint::DIRECT)
- && arm_compute::CLDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
- {
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDirectConvolutionLayer");
- return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
- return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
- }
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info,
- ConvolutionMethodHint conv_method)
-{
- if((conv_method == ConvolutionMethodHint::DIRECT)
- && arm_compute::NEDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
- {
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDirectConvolutionLayer");
- return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
- return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
- }
-}
-} // namespace
-
-/** Grouped Convolution function */
-class GroupedConvolutionFunction final : public arm_compute::IFunction
-{
-public:
- /** Default Constructor */
- GroupedConvolutionFunction()
- : _convolutions()
- {
- }
- /** Default Destructor */
- ~GroupedConvolutionFunction() final = default;
- /** Prevent instances from being copy constructed */
- GroupedConvolutionFunction(const GroupedConvolutionFunction &) = delete;
- /** Prevent instances from being copy assigned */
- GroupedConvolutionFunction &operator=(const GroupedConvolutionFunction &) = delete;
- /** Allow instances to be move constructed */
- GroupedConvolutionFunction(GroupedConvolutionFunction &&) noexcept = default;
- /** Allow instances to be move assigned */
- GroupedConvolutionFunction &operator=(GroupedConvolutionFunction &&) noexcept = default;
- /** Adds a convolution
- *
- * @param convolution Convolution function to add
- */
- void add_convolution_function(std::unique_ptr<IFunction> convolution)
- {
- _convolutions.emplace_back(std::move(convolution));
- }
-
- // Inherited methods overriden:
- void run() override
- {
- for(auto &c : _convolutions)
- {
- c->run();
- }
- }
-
-private:
- std::vector<std::unique_ptr<IFunction>> _convolutions;
-};
-
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
-
- // Set weights and biases info
- if(_weights.tensor() == nullptr)
- {
- TensorInfo info = TensorInfo(TensorShape(_conv_width, _conv_height, in->info()->dimension(2) / _num_groups, _ofm),
- in->info()->num_channels(),
- in->info()->data_type(),
- in->info()->fixed_point_position());
- info.set_quantization_info(_weights_quant_info);
- _weights.set_info(std::move(info));
- }
- if(_biases.has_accessor() && _biases.tensor() == nullptr)
- {
- DataType dt = in->info()->data_type();
- _biases.set_info(TensorInfo(TensorShape(_ofm), in->info()->num_channels(), is_data_type_quantized_asymmetric(dt) ? DataType::S32 : dt, in->info()->fixed_point_position()));
- }
-
- std::unique_ptr<arm_compute::IFunction> func;
- _target_hint = ctx.hints().target_hint();
- const ConvolutionMethodHint conv_method_hint = ctx.hints().convolution_method_hint();
-
- // Check if the weights and biases are loaded
- bool weights_are_loaded = _weights.tensor() != nullptr;
- bool biases_are_loaded = _biases.has_accessor() ? _biases.tensor() != nullptr : true;
-
- // Set bias and weights target
- _weights.set_target(_target_hint);
- if(_biases.has_accessor())
- {
- _biases.set_target(_target_hint);
- }
-
- // Calculate output shape
- TensorShape output_shape = calculate_convolution_layer_output_shape(in->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
-
- // Output auto inizialitation if not yet initialized
- arm_compute::auto_init_if_empty(*out->info(), output_shape, 1, in->info()->data_type(), in->info()->fixed_point_position(),
- (_out_quant_info.empty()) ? in->info()->quantization_info() : _out_quant_info);
-
- // Create appropriate convolution function
- if(_num_groups == 1)
- {
- func = instantiate_convolution(in, out, conv_method_hint);
- }
- else
- {
- func = instantiate_grouped_convolution(in, out, conv_method_hint);
- }
-
- // Fill weights
- if(!weights_are_loaded)
- {
- _weights.allocate_and_fill_if_needed();
- }
- // Fill biases
- if(!biases_are_loaded)
- {
- _biases.allocate_and_fill_if_needed();
- }
-
- ARM_COMPUTE_LOG_GRAPH_INFO(" Data Type: " << in->info()->data_type()
- << " Input Shape: " << in->info()->tensor_shape()
- << " Weights shape: " << _weights.info().tensor_shape()
- << " Biases Shape: " << _biases.info().tensor_shape()
- << " Output Shape: " << out->info()->tensor_shape()
- << " PadStrideInfo: " << _conv_info
- << " Groups: " << _num_groups
- << " WeightsInfo: " << _weights_info
- << std::endl);
-
- return func;
-}
-
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
-{
- std::unique_ptr<arm_compute::IFunction> func;
- if(_target_hint == TargetHint::OPENCL)
- {
- func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
- }
- else
- {
- func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
- }
- return func;
-}
-
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_grouped_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
-{
- // Get tensor shapes
- TensorShape input_shape = input->info()->tensor_shape();
- TensorShape output_shape = output->info()->tensor_shape();
- TensorShape weights_shape = _weights.info().tensor_shape();
- TensorShape biases_shape = _biases.info().tensor_shape();
-
- ARM_COMPUTE_ERROR_ON_MSG((input_shape.z() % _num_groups) != 0, "Input depth not multiple of the number of groups!");
- ARM_COMPUTE_ERROR_ON_MSG((output_shape.z() % _num_groups) != 0, "Output depth not multiple of the number of groups!");
- ARM_COMPUTE_ERROR_ON_MSG((weights_shape[3] % _num_groups) != 0, "Number of kernels not multiple of the number of groups!");
- ARM_COMPUTE_ERROR_ON_MSG((biases_shape.x() % _num_groups) != 0, "Biases not multiple of the number of groups!");
-
- // Create a grouped convolution function
- auto grouped_conv = arm_compute::support::cpp14::make_unique<GroupedConvolutionFunction>();
-
- // Create sub-tensors vectors
- _is = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
- _os = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
- _ws = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
- _bs = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
-
- // Calculate sub-tensor splits
- const int input_split = input_shape.z() / _num_groups;
- const int output_split = output_shape.z() / _num_groups;
- const int weights_split = weights_shape[3] / _num_groups;
- const int biases_split = biases_shape.x() / _num_groups;
-
- // Calculate sub-tensor shapes
- input_shape.set(2, input_split);
- output_shape.set(2, output_split);
- weights_shape.set(3, weights_split);
- biases_shape.set(0, biases_split);
-
- // Configure sub-tensors
- for(int i = 0; i < static_cast<int>(_num_groups); ++i)
- {
- // Create convolution function
- std::unique_ptr<arm_compute::IFunction> func;
-
- // Calculate sub-tensors starting coordinates
- Coordinates input_coord(0, 0, input_split * i);
- Coordinates output_coord(0, 0, output_split * i);
- Coordinates weights_coord(0, 0, 0, weights_split * i);
- Coordinates biases_coord(biases_split * i);
-
- // Create sub-tensors for input, output, weights and bias
- auto hint_to_use = (_target_hint == TargetHint::OPENCL) ? TargetHint::OPENCL : TargetHint::NEON;
- _is[i] = SubTensor(input, input_shape, input_coord, hint_to_use);
- _os[i] = SubTensor(output, output_shape, output_coord, hint_to_use);
- _ws[i] = SubTensor(_weights.tensor(), weights_shape, weights_coord, hint_to_use);
- _bs[i] = SubTensor(_biases.tensor(), biases_shape, biases_coord, hint_to_use);
-
- // Instantiate convolution function
- if(_target_hint == TargetHint::OPENCL)
- {
- func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
- }
- else
- {
- func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
- }
-
- // Add convolution function to the list of convolutions for the grouped convolution
- grouped_conv->add_convolution_function(std::move(func));
- }
-
- return std::move(grouped_conv);
-}
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
new file mode 100644
index 0000000..6c31a6b
--- /dev/null
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ConvolutionLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ConvolutionLayerNode::ConvolutionLayerNode(PadStrideInfo info, ConvolutionMethod method, FastMathHint fast_math_hint, QuantizationInfo out_quant_info)
+ : _info(std::move(info)), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info)
+{
+ _input_edges.resize(3, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+void ConvolutionLayerNode::set_convolution_method(ConvolutionMethod method)
+{
+ _method = method;
+}
+
+ConvolutionMethod ConvolutionLayerNode::convolution_method() const
+{
+ return _method;
+}
+
+void ConvolutionLayerNode::set_fast_math_hint(FastMathHint hint)
+{
+ _fast_math_hint = hint;
+}
+
+FastMathHint ConvolutionLayerNode::fast_math_hint() const
+{
+ return _fast_math_hint;
+}
+
+PadStrideInfo ConvolutionLayerNode::convolution_info() const
+{
+ return _info;
+}
+
+TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const TensorDescriptor &weights_descriptor,
+ const PadStrideInfo &info)
+{
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+ std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+
+ return output_descriptor;
+}
+
+bool ConvolutionLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor ConvolutionLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ const Tensor *src = input(0);
+ const Tensor *weights = input(1);
+
+ ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+ TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
+ if(!_out_quant_info.empty())
+ {
+ output_info.quant_info = _out_quant_info;
+ }
+
+ return output_info;
+}
+
+NodeType ConvolutionLayerNode::type() const
+{
+ return NodeType::ConvolutionLayer;
+}
+
+void ConvolutionLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
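A worked example of compute_output_descriptor above, using the default FLOOR rounding of scaled_dimensions:

// Input 224x224x3 (WxHxC), weights 7x7x3x64, PadStrideInfo(2, 2, 3, 3):
//   output_width  = (224 + 3 + 3 - 7) / 2 + 1 = 112
//   output_height = (224 + 3 + 3 - 7) / 2 + 1 = 112
//   channels      = weights_descriptor.shape[3] = 64
// => output descriptor shape 112x112x64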
diff --git a/src/graph/nodes/DeQuantizationLayer.cpp b/src/graph/nodes/DeQuantizationLayer.cpp
deleted file mode 100644
index af9ecee..0000000
--- a/src/graph/nodes/DeQuantizationLayer.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/DequantizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> DequantizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- _target_hint = ctx.hints().target_hint();
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
-
- if(_min_max.tensor() == nullptr)
- {
- TensorShape shape = in->info()->tensor_shape();
- shape.set(Window::DimX, 2);
- shape.remove_dimension(1);
- shape.remove_dimension(1);
-
- _min_max.set_info(TensorInfo(shape, in->info()->num_channels(), DataType::F32));
- _min_max.set_target(_target_hint);
- }
-
- bool minmax_is_loaded = _min_max.tensor() != nullptr;
-
- // Create node context
- NodeContext node_ctx(OperationType::DequantizationLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(_min_max.tensor());
- node_ctx.add_output(out);
-
- // Fill min max
- if(!minmax_is_loaded)
- {
- _min_max.allocate_and_fill_if_needed();
- }
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::DequantizationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/DepthConcatenateLayerNode.cpp b/src/graph/nodes/DepthConcatenateLayerNode.cpp
new file mode 100644
index 0000000..08cccc1
--- /dev/null
+++ b/src/graph/nodes/DepthConcatenateLayerNode.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DepthConcatenateLayerNode::DepthConcatenateLayerNode(unsigned int total_nodes)
+ : _total_nodes(total_nodes), _is_enabled(true)
+{
+ _input_edges.resize(_total_nodes, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+void DepthConcatenateLayerNode::set_enabled(bool is_enabled)
+{
+ _is_enabled = is_enabled;
+}
+
+bool DepthConcatenateLayerNode::is_enabled() const
+{
+ return _is_enabled;
+}
+
+TensorDescriptor DepthConcatenateLayerNode::compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors)
+{
+ ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0);
+
+ TensorDescriptor output_descriptor = input_descriptors[0];
+
+ size_t max_x = 0;
+ size_t max_y = 0;
+ size_t depth = 0;
+
+ for(const auto &input_descriptor : input_descriptors)
+ {
+ max_x = std::max(input_descriptor.shape.x(), max_x);
+ max_y = std::max(input_descriptor.shape.y(), max_y);
+ depth += input_descriptor.shape.z();
+ }
+
+ output_descriptor.shape.set(0, max_x);
+ output_descriptor.shape.set(1, max_y);
+ output_descriptor.shape.set(2, depth);
+
+ return output_descriptor;
+}
+
+bool DepthConcatenateLayerNode::forward_descriptors()
+{
+ if(_outputs[0] != NullTensorID)
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor DepthConcatenateLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ // Check if all input tensors are set
+ bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
+ {
+ return eid != EmptyEdgeID;
+ });
+
+ TensorDescriptor output_info = {};
+
+ if(are_all_inputs_set)
+ {
+ std::vector<TensorDescriptor> inputs_descriptors;
+ for(unsigned int i = 0; i < _input_edges.size(); ++i)
+ {
+ const Tensor *t = _graph->tensor(input_id(i));
+ ARM_COMPUTE_ERROR_ON(t == nullptr);
+ inputs_descriptors.push_back(t->desc());
+ }
+ output_info = compute_output_descriptor(inputs_descriptors);
+ }
+
+ return output_info;
+}
+
+NodeType DepthConcatenateLayerNode::type() const
+{
+ return NodeType::DepthConcatenateLayer;
+}
+
+void DepthConcatenateLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
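
Note on the shape rule above: compute_output_descriptor grows width and height to the largest input and sums the depths. A minimal standalone sketch of that rule (Shape3D is a hypothetical stand-in for the library's TensorShape, used here for illustration only):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for the library's TensorShape, for illustration only.
    struct Shape3D
    {
        std::size_t x, y, z;
    };

    // Mirrors DepthConcatenateLayerNode::compute_output_descriptor: width and
    // height grow to the largest input, depths are summed.
    Shape3D depth_concat_shape(const std::vector<Shape3D> &inputs)
    {
        Shape3D out{ 0, 0, 0 };
        for(const auto &in : inputs)
        {
            out.x = std::max(out.x, in.x);
            out.y = std::max(out.y, in.y);
            out.z += in.z;
        }
        return out;
    }
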
diff --git a/src/graph/nodes/DepthConvertLayer.cpp b/src/graph/nodes/DepthConvertLayer.cpp
deleted file mode 100644
index 9b328e7..0000000
--- a/src/graph/nodes/DepthConvertLayer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/DepthConvertLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-DepthConvertLayer::DepthConvertLayer(const ConvertPolicy policy, uint32_t shift, DataType output_datatype)
- : _policy(policy), _shift(shift), _output_datatype(output_datatype)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> DepthConvertLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- _target_hint = ctx.hints().target_hint();
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
-
- // Auto configure output
- arm_compute::auto_init_if_empty(*out->info(), in->info()->tensor_shape(), 1, _output_datatype, in->info()->fixed_point_position());
-
- // Create node context
- NodeContext node_ctx(OperationType::DepthConvertLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
- node_ctx.add_parameter<ConvertPolicy>("ConvertPolicy", _policy);
- node_ctx.add_parameter<uint32_t>("shift", _shift);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::DepthConvertLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayer.cpp b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
deleted file mode 100644
index e5101cc..0000000
--- a/src/graph/nodes/DepthwiseConvolutionLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/DepthwiseConvolutionLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> DepthwiseConvolutionLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- if(_weights.tensor() == nullptr)
- {
- TensorShape weights_shape(_conv_width, _conv_height, input->tensor()->info()->tensor_shape().z());
- TensorInfo info = TensorInfo(TensorShape(weights_shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
- info.set_quantization_info(_quant_info);
- _weights.set_info(std::move(info));
- }
- if(_biases.has_accessor() && _biases.tensor() == nullptr)
- {
- DataType dt = in->info()->data_type();
- _biases.set_info(TensorInfo(TensorShape(in->info()->dimension(2)), in->info()->num_channels(), is_data_type_quantized_asymmetric(dt) ? DataType::S32 : dt, in->info()->fixed_point_position()));
- }
-
- bool weights_is_loaded = _weights.tensor() != nullptr;
- bool biases_is_loaded = _biases.has_accessor() ? _biases.tensor() != nullptr : true;
-
- _weights.set_target(_target_hint);
- if(_biases.has_accessor())
- {
- _biases.set_target(_target_hint);
- }
-
- // Create node context
- NodeContext node_ctx(OperationType::DepthwiseConvolutionLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_input(_weights.tensor());
- if(_biases.has_accessor())
- {
- node_ctx.add_input(_biases.tensor());
- }
- node_ctx.add_output(out);
- node_ctx.add_parameter<PadStrideInfo>("ConvolutionInfo", _conv_info);
- node_ctx.add_parameter<bool>("Optimized3x3", _opt3x3);
-
- // Configure operation
- auto func = OperationRegistry::get().find_operation(OperationType::DepthwiseConvolutionLayer, _target_hint)->configure(node_ctx);
-
- // Fill tensors
- if(!weights_is_loaded)
- {
- _weights.allocate_and_fill_if_needed();
- }
- if(!biases_is_loaded)
- {
- _biases.allocate_and_fill_if_needed();
- }
-
- // Get function
- return func;
-}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
new file mode 100644
index 0000000..1a6f8d3
--- /dev/null
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method)
+ : _info(std::move(info)), _method(method)
+{
+ _input_edges.resize(3, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+void DepthwiseConvolutionLayerNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method)
+{
+ _method = method;
+}
+
+DepthwiseConvolutionMethod DepthwiseConvolutionLayerNode::depthwise_convolution_method() const
+{
+ return _method;
+}
+
+PadStrideInfo DepthwiseConvolutionLayerNode::convolution_info() const
+{
+ return _info;
+}
+
+TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const TensorDescriptor &weights_descriptor,
+ const PadStrideInfo &info)
+{
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+ std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
+
+ return output_descriptor;
+}
+
+bool DepthwiseConvolutionLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor DepthwiseConvolutionLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ const Tensor *src = input(0);
+ const Tensor *weights = input(1);
+
+ ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+ return compute_output_descriptor(src->desc(), weights->desc(), _info);
+}
+
+NodeType DepthwiseConvolutionLayerNode::type() const
+{
+ return NodeType::DepthwiseConvolutionLayer;
+}
+
+void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
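
The output size above comes from arm_compute::scaled_dimensions. A simplified sketch of the underlying arithmetic, assuming symmetric padding, FLOOR rounding, and a kernel that fits inside the padded input (the real helper also honours the rounding policy held in PadStrideInfo):

    #include <utility>

    // Simplified stand-in for arm_compute::scaled_dimensions, assuming symmetric
    // padding, FLOOR rounding and a kernel that fits in the padded input.
    std::pair<unsigned int, unsigned int> scaled_dims(unsigned int w, unsigned int h,
                                                      unsigned int kw, unsigned int kh,
                                                      unsigned int stride_x, unsigned int stride_y,
                                                      unsigned int pad_x, unsigned int pad_y)
    {
        const unsigned int out_w = (w + 2 * pad_x - kw) / stride_x + 1;
        const unsigned int out_h = (h + 2 * pad_y - kh) / stride_y + 1;
        return std::make_pair(out_w, out_h);
    }
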
diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp
new file mode 100644
index 0000000..568b882
--- /dev/null
+++ b/src/graph/nodes/EltwiseLayerNode.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/EltwiseLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+EltwiseLayerNode::EltwiseLayerNode(EltwiseOperation op, ConvertPolicy c_policy, RoundingPolicy r_policy)
+ : _op(op), _convert_policy(c_policy), _rounding_policy(r_policy)
+{
+ _input_edges.resize(2, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+EltwiseOperation EltwiseLayerNode::eltwise_operation() const
+{
+ return _op;
+}
+
+ConvertPolicy EltwiseLayerNode::convert_policy() const
+{
+ return _convert_policy;
+}
+
+RoundingPolicy EltwiseLayerNode::rounding_policy() const
+{
+ return _rounding_policy;
+}
+
+bool EltwiseLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx, _op, _convert_policy, _rounding_policy);
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return src->desc();
+}
+
+NodeType EltwiseLayerNode::type() const
+{
+ return NodeType::EltwiseLayer;
+}
+
+void EltwiseLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
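
Every node in this patch propagates shapes the same way: once the required edges and the output tensor exist, configure_output is computed and copied onto the destination descriptor. A condensed sketch of that pattern with hypothetical stand-in types (not the library's Graph API):

    #include <cassert>

    // Hypothetical stand-ins, not the library's Graph API.
    struct Descriptor { /* shape, data type, quantization info, ... */ };
    struct Tensor { Descriptor desc; };

    struct Node
    {
        Tensor *src = nullptr;
        Tensor *dst = nullptr;

        // Element-wise ops forward the input descriptor unchanged.
        Descriptor configure_output() const
        {
            assert(src != nullptr);
            return src->desc;
        }

        // Mirrors forward_descriptors(): propagate only once both endpoints
        // exist, and report whether anything was forwarded.
        bool forward_descriptors()
        {
            if(src != nullptr && dst != nullptr)
            {
                dst->desc = configure_output();
                return true;
            }
            return false;
        }
    };
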
diff --git a/src/graph/nodes/FlattenLayer.cpp b/src/graph/nodes/FlattenLayer.cpp
deleted file mode 100644
index ea08296..0000000
--- a/src/graph/nodes/FlattenLayer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FlattenLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> FlattenLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- _target_hint = ctx.hints().target_hint();
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
-
- // Auto configure output
- TensorShape tensor_shape = in->info()->tensor_shape();
- tensor_shape.collapse(in->info()->num_dimensions());
- arm_compute::auto_init_if_empty(*out->info(), tensor_shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
-
- // Create node context
- NodeContext node_ctx(OperationType::FlattenLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::FlattenLayer, _target_hint)->configure(node_ctx);
-}
\ No newline at end of file
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
new file mode 100644
index 0000000..78b45dc
--- /dev/null
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FlattenLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+FlattenLayerNode::FlattenLayerNode()
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+bool FlattenLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor FlattenLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ TensorDescriptor output_desc = src->desc();
+ output_desc.shape.collapse(src->desc().shape.num_dimensions());
+
+ return output_desc;
+}
+
+NodeType FlattenLayerNode::type() const
+{
+ return NodeType::FlattenLayer;
+}
+
+void FlattenLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
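
TensorShape::collapse above folds every dimension into a single linear one, so flattening a descriptor reduces to a product of its extents. A small illustration:

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // What TensorShape::collapse amounts to for FlattenLayerNode: every
    // dimension folds into one linear extent.
    std::size_t flatten_size(const std::vector<std::size_t> &dims)
    {
        return std::accumulate(dims.begin(), dims.end(), std::size_t{ 1 },
                               [](std::size_t acc, std::size_t d) { return acc * d; });
    }
    // e.g. flatten_size({7, 7, 512}) == 25088
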
diff --git a/src/graph/nodes/FloorLayer.cpp b/src/graph/nodes/FloorLayer.cpp
deleted file mode 100644
index 8750546..0000000
--- a/src/graph/nodes/FloorLayer.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FloorLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> FloorLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::FloorLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::FloorLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 3742150..d94a785 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,86 +21,89 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/nodes/FullyConnectedLayer.h"
+#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
-using namespace arm_compute::graph;
-
-namespace
+namespace arm_compute
{
-TensorShape calculate_fullyconnected_layer_output_shape(const TensorShape &input_shape, unsigned int output_neurons)
+namespace graph
+{
+FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs)
+ : _num_outputs(num_outputs)
+{
+ _input_edges.resize(3, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const TensorDescriptor &input_descriptor,
+ unsigned int num_outputs)
+{
+ unsigned int num_weights = 1;
+ unsigned int num_dimensions = input_descriptor.shape.num_dimensions();
+ // Ignore the batch dimension if there is one:
+ if(num_dimensions == 2 || num_dimensions == 4)
+ {
+ num_dimensions--;
+ }
+ for(unsigned int i = 0; i < num_dimensions; i++)
+ {
+ num_weights *= input_descriptor.shape[i];
+ }
+
+ TensorDescriptor weights_descriptor = input_descriptor;
+ weights_descriptor.shape = TensorShape(num_weights, num_outputs);
+
+ return weights_descriptor;
+}
+
+TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ unsigned int num_outputs)
{
// Note: Only 1D batch space is supported at the moment
- unsigned int batches = input_shape[1];
- if(input_shape.num_dimensions() > 2)
+ unsigned int batches = input_descriptor.shape[1];
+ if(input_descriptor.shape.num_dimensions() > 2)
{
- batches = input_shape[3];
+ batches = input_descriptor.shape[3];
}
- return TensorShape(output_neurons, batches);
-}
-} // namespace
-std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape = TensorShape(num_outputs, batches);
+
+ return output_descriptor;
+}
+
+bool FullyConnectedLayerNode::forward_descriptors()
{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- if(_weights.tensor() == nullptr)
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
- unsigned int num_weights = 1;
- unsigned int num_dimensions = in->info()->num_dimensions();
- // Ignore the batch dimension if there is one:
- if(num_dimensions == 2 || num_dimensions == 4)
- {
- num_dimensions--;
- }
- for(unsigned int i = 0; i < num_dimensions; i++)
- {
- num_weights *= in->info()->dimension(i);
- }
- _weights.set_info(TensorInfo(TensorShape(num_weights, _num_neurons), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
}
- if(_biases.tensor() == nullptr)
- {
- _biases.set_info(TensorInfo(TensorShape(_num_neurons), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
- }
-
- // Auto configure output
- arm_compute::auto_init_if_empty(*out->info(),
- calculate_fullyconnected_layer_output_shape(in->info()->tensor_shape(), _num_neurons),
- in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
-
- bool weights_are_loaded = _weights.tensor() != nullptr;
- bool biases_are_loaded = _biases.tensor() != nullptr;
-
- // Create node context
- NodeContext node_ctx(OperationType::FullyConnectedLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_input(_weights.set_target(_target_hint));
- node_ctx.add_input(_biases.set_target(_target_hint));
- node_ctx.add_output(out);
-
- // Configure operation
- auto func = OperationRegistry::get().find_operation(OperationType::FullyConnectedLayer, _target_hint)->configure(node_ctx);
-
- // Fill biases
- if(!weights_are_loaded)
- {
- _weights.allocate_and_fill_if_needed();
- }
- if(!biases_are_loaded)
- {
- _biases.allocate_and_fill_if_needed();
- }
-
- // Get function
- return func;
+ return false;
}
+
+TensorDescriptor FullyConnectedLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return compute_output_descriptor(src->desc(), _num_outputs);
+}
+
+NodeType FullyConnectedLayerNode::type() const
+{
+ return NodeType::FullyConnectedLayer;
+}
+
+void FullyConnectedLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
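
compute_weights_descriptor above derives a 2D weights matrix from the input shape: every non-batch dimension is multiplied out, and the batch dimension is dropped when the rank is 2 or 4. A standalone sketch of that shape logic (plain std:: types, not the library's descriptors):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Multiply out all non-batch dimensions (the batch dimension exists when
    // the rank is 2 or 4) to get a num_weights x num_outputs matrix.
    std::pair<std::size_t, std::size_t> fc_weights_shape(const std::vector<std::size_t> &input_shape,
                                                         std::size_t num_outputs)
    {
        std::size_t num_dims = input_shape.size();
        if(num_dims == 2 || num_dims == 4)
        {
            num_dims--; // drop the batch dimension
        }
        std::size_t num_weights = 1;
        for(std::size_t i = 0; i < num_dims; ++i)
        {
            num_weights *= input_shape[i];
        }
        return std::make_pair(num_weights, num_outputs);
    }
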
diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp
new file mode 100644
index 0000000..709eaae
--- /dev/null
+++ b/src/graph/nodes/InputNode.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/InputNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+InputNode::InputNode(TensorDescriptor desc)
+ : _desc(std::move(desc))
+{
+ _outputs.resize(1, NullTensorID);
+}
+
+bool InputNode::forward_descriptors()
+{
+ if(output_id(0) != NullTensorID)
+ {
+ Tensor *t = output(0);
+ ARM_COMPUTE_ERROR_ON(t == nullptr);
+ t->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor InputNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ return _desc;
+}
+
+NodeType InputNode::type() const
+{
+ return NodeType::Input;
+}
+
+void InputNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/L2NormalizeLayer.cpp b/src/graph/nodes/L2NormalizeLayer.cpp
deleted file mode 100644
index 9813ba4..0000000
--- a/src/graph/nodes/L2NormalizeLayer.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/L2NormalizeLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-L2NormalizeLayer::L2NormalizeLayer(unsigned int axis, float epsilon)
- : _axis(axis), _epsilon(epsilon)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> L2NormalizeLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::L2NormalizeLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
- node_ctx.add_parameter<unsigned int>("axis", _axis);
- node_ctx.add_parameter<float>("epsilon", _epsilon);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::L2NormalizeLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/NormalizationLayer.cpp b/src/graph/nodes/NormalizationLayer.cpp
deleted file mode 100644
index a489329..0000000
--- a/src/graph/nodes/NormalizationLayer.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/NormalizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-NormalizationLayer::NormalizationLayer(const NormalizationLayerInfo norm_info)
- : _norm_info(norm_info)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> NormalizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::NormalizationLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
- node_ctx.add_parameter<NormalizationLayerInfo>("NormalizationLayerInfo", _norm_info);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::NormalizationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp
new file mode 100644
index 0000000..a7b3738
--- /dev/null
+++ b/src/graph/nodes/NormalizationLayerNode.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/NormalizationLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info)
+ : _info(norm_info)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+NormalizationLayerInfo NormalizationLayerNode::normalization_info() const
+{
+ return _info;
+}
+
+bool NormalizationLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor NormalizationLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return src->desc();
+}
+
+NodeType NormalizationLayerNode::type() const
+{
+ return NodeType::NormalizationLayer;
+}
+
+void NormalizationLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/nodes/OutputNode.cpp
similarity index 61%
copy from src/graph/CL/CLMap.cpp
copy to src/graph/nodes/OutputNode.cpp
index 5289ea9..8aa249b 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/nodes/OutputNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,41 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/nodes/OutputNode.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Tensor.h"
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_compute
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+namespace graph
+{
+OutputNode::OutputNode()
+{
+ _input_edges.resize(1, EmptyEdgeID);
}
-void CLMap::run()
+bool OutputNode::forward_descriptors()
{
- _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+ return true;
}
+
+TensorDescriptor OutputNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ return TensorDescriptor();
+}
+
+NodeType OutputNode::type() const
+{
+ return NodeType::Output;
+}
+
+void OutputNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
deleted file mode 100644
index 2c15119..0000000
--- a/src/graph/nodes/PoolingLayer.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/PoolingLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-PoolingLayer::PoolingLayer(const PoolingLayerInfo pool_info)
- : _pool_info(pool_info)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::PoolingLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
- node_ctx.add_parameter<PoolingLayerInfo>("PoolingLayerInfo", _pool_info);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::PoolingLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
new file mode 100644
index 0000000..26c145a
--- /dev/null
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PoolingLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info)
+ : _info(std::move(pool_info))
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+PoolingLayerInfo PoolingLayerNode::pooling_info() const
+{
+ return _info;
+}
+
+TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ PoolingLayerInfo info)
+{
+ unsigned int pooled_width = 0;
+ unsigned int pooled_height = 0;
+
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int pool_size_x = info.is_global_pooling() ? input_width : info.pool_size().width;
+ const unsigned int pool_size_y = info.is_global_pooling() ? input_height : info.pool_size().height;
+
+ std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info());
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), pooled_width);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), pooled_height);
+
+ return output_descriptor;
+}
+
+bool PoolingLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor PoolingLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return compute_output_descriptor(src->desc(), _info);
+}
+
+NodeType PoolingLayerNode::type() const
+{
+ return NodeType::PoolingLayer;
+}
+
+void PoolingLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
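
The pooled size above is again computed via scaled_dimensions, with global pooling first widening the window to the whole input plane — which collapses the output to 1x1 when padding is zero. A sketch under the same FLOOR-rounding, symmetric-padding assumptions as before:

    #include <utility>

    // Sketch of the pooled-size arithmetic. Global pooling widens the window
    // to the whole plane, so with zero padding the output collapses to 1x1.
    std::pair<unsigned int, unsigned int> pooled_dims(unsigned int w, unsigned int h,
                                                      unsigned int pool_w, unsigned int pool_h,
                                                      unsigned int stride_x, unsigned int stride_y,
                                                      unsigned int pad_x, unsigned int pad_y,
                                                      bool is_global)
    {
        if(is_global)
        {
            pool_w = w;
            pool_h = h;
        }
        const unsigned int out_w = (w + 2 * pad_x - pool_w) / stride_x + 1;
        const unsigned int out_h = (h + 2 * pad_y - pool_h) / stride_y + 1;
        return std::make_pair(out_w, out_h);
    }
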
diff --git a/src/graph/nodes/QuantizationLayer.cpp b/src/graph/nodes/QuantizationLayer.cpp
deleted file mode 100644
index c102f47..0000000
--- a/src/graph/nodes/QuantizationLayer.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/QuantizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> QuantizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- _target_hint = ctx.hints().target_hint();
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
-
- // Create node context
- NodeContext node_ctx(OperationType::QuantizationLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::QuantizationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
index b0c117e..58610e9 100644
--- a/src/graph/nodes/ReshapeLayer.cpp
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,37 +21,56 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/nodes/ReshapeLayer.h"
+#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
-using namespace arm_compute::graph;
-
-ReshapeLayer::ReshapeLayer(TensorShape shape)
+namespace arm_compute
+{
+namespace graph
+{
+ReshapeLayerNode::ReshapeLayerNode(TensorShape shape)
: _shape(shape)
{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
}
-std::unique_ptr<arm_compute::IFunction> ReshapeLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+bool ReshapeLayerNode::forward_descriptors()
{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- _target_hint = ctx.hints().target_hint();
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
-
- // Auto configure output
- arm_compute::auto_init_if_empty(*out->info(), _shape, 1, in->info()->data_type(), in->info()->fixed_point_position(), in->info()->quantization_info());
-
- // Create node context
- NodeContext node_ctx(OperationType::ReshapeLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::ReshapeLayer, _target_hint)->configure(node_ctx);
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
}
+
+TensorDescriptor ReshapeLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ TensorDescriptor output_desc = src->desc();
+ output_desc.shape = _shape;
+
+ return output_desc;
+}
+
+NodeType ReshapeLayerNode::type() const
+{
+ return NodeType::ReshapeLayer;
+}
+
+void ReshapeLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/ResidualLayer.cpp b/src/graph/nodes/ResidualLayer.cpp
deleted file mode 100644
index 87404f9..0000000
--- a/src/graph/nodes/ResidualLayer.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/ResidualLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "arm_compute/graph/SubGraph.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "support/ToolchainSupport.h"
-#include "utils/Utils.h"
-
-#include <memory>
-#include <tuple>
-#include <vector>
-
-using namespace arm_compute::graph;
-
-/** Residual function */
-class ResidualFunction final : public arm_compute::IFunction
-{
-public:
- /** Default Constructor */
- ResidualFunction(GraphContext &ctx, ITensorObject *output)
- : _ctx(ctx), _input(nullptr), _output(output), _func(nullptr), _graphs(), _graph_outputs()
- {
- }
-
- /** Prevent instances from being copy constructed */
- ResidualFunction(const ResidualFunction &) = delete;
- /** Prevent instances from being copy assigned */
- const ResidualFunction &operator=(const ResidualFunction &) = delete;
- /** Prevent instances from being move constructed */
- ResidualFunction(ResidualFunction &&) = delete;
- /** Prevent instances from being move assigned */
- ResidualFunction &operator=(ResidualFunction &&) = delete;
- /** Default destructor */
- ~ResidualFunction() override = default;
-
- /** Set the input (when using only one sub graph)
- *
- * @param[in] input Input to set
- */
- void set_input(std::unique_ptr<ITensorObject> input)
- {
- _input = std::move(input);
- }
-
- /** Registers graph to be executed by the residual function
- *
- * @param[in] graph Graph to register
- * @param[in] output Output to register
- */
- void register_graph(std::unique_ptr<Graph> graph, std::unique_ptr<ITensorObject> output)
- {
- _graphs.push_back(std::move(graph));
- _graph_outputs.push_back(std::move(output));
- }
-
- /** Configure the function */
- void configure()
- {
- ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
- TargetHint target_hint = _ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::ArithmeticAddition);
- node_ctx.set_target(target_hint);
-
- if(_graphs.size() == 1)
- {
- arm_compute::ITensor *in = _input->tensor();
- node_ctx.add_input(in);
- }
-
- for(auto &o : _graph_outputs)
- {
- arm_compute::ITensor *in = o->tensor();
- node_ctx.add_input(in);
- }
-
- arm_compute::ITensor *out = _output->tensor();
- auto_init_if_empty(*out->info(), *_graph_outputs[0]->tensor()->info());
- node_ctx.add_output(out);
-
- _func = OperationRegistry::get().find_operation(OperationType::ArithmeticAddition, target_hint)->configure(node_ctx);
-
- for(auto &o : _graph_outputs)
- {
- o->allocate();
- }
- }
-
- // Inherited methods overridden:
- void run() override
- {
- ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
-
- for(auto &g : _graphs)
- {
- ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
- g->run();
- }
-
- _func->run();
- }
-
-private:
- GraphContext _ctx;
- std::unique_ptr<ITensorObject> _input;
- ITensorObject *_output;
- std::unique_ptr<arm_compute::IFunction> _func;
- std::vector<std::unique_ptr<Graph>> _graphs;
- std::vector<std::unique_ptr<ITensorObject>> _graph_outputs;
-};
-
-std::unique_ptr<arm_compute::IFunction> ResidualLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
-
- // Create residual function
- auto func = arm_compute::support::cpp14::make_unique<ResidualFunction>(ctx, output);
-
- if(_sub_graphs.size() == 1)
- {
- std::unique_ptr<ITensorObject> original_in;
- original_in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
- input->tensor()->info()->tensor_shape(),
- Coordinates());
- func->set_input(std::move(original_in));
- }
-
- // Construct all sub-graphs given the input/output
- for(auto &sg : _sub_graphs)
- {
- ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
-
- // IO buffers
- std::unique_ptr<ITensorObject> in;
- std::unique_ptr<ITensorObject> out;
- std::unique_ptr<ITensorObject> func_in;
-
- // Create input sub-tensor
- if(!sg->has_input())
- {
- in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
- input->tensor()->info()->tensor_shape(),
- Coordinates());
- }
-
- // Create output sub-tensor
- if(!sg->has_output())
- {
- ITensorInfo *info = input->tensor()->info();
- func_in = arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo(info->num_channels(), info->data_type(), info->fixed_point_position()));
- func_in->set_target(ctx.hints().target_hint());
- out = arm_compute::support::cpp14::make_unique<SubTensor>(func_in->tensor(),
- TensorShape(),
- Coordinates(0, 0, 0),
- func_in->target(),
- true);
- }
-
- // Construct sub_graph
- auto g = sg->construct(ctx, std::move(in), std::move(out));
-
- // Register graph to function
- func->register_graph(std::move(g), std::move(func_in));
- }
-
- func->configure();
-
- return std::move(func);
-}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
deleted file mode 100644
index 7f2325b..0000000
--- a/src/graph/nodes/SoftmaxLayer.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/SoftmaxLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
- ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
- arm_compute::ITensor *in = input->tensor();
- arm_compute::ITensor *out = output->tensor();
- _target_hint = ctx.hints().target_hint();
-
- // Create node context
- NodeContext node_ctx(OperationType::SoftmaxLayer);
- node_ctx.set_target(_target_hint);
- node_ctx.add_input(in);
- node_ctx.add_output(out);
-
- // Get function
- return OperationRegistry::get().find_operation(OperationType::SoftmaxLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp
new file mode 100644
index 0000000..57e5561
--- /dev/null
+++ b/src/graph/nodes/SoftmaxLayerNode.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SoftmaxLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+SoftmaxLayerNode::SoftmaxLayerNode(float beta)
+ : _beta(beta)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+float SoftmaxLayerNode::beta() const
+{
+ return _beta;
+}
+
+bool SoftmaxLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor SoftmaxLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ TensorDescriptor out_desc = src->desc();
+ out_desc.quant_info = QuantizationInfo(1.f / 256.f, 0);
+
+ return out_desc;
+}
+
+NodeType SoftmaxLayerNode::type() const
+{
+ return NodeType::SoftmaxLayer;
+}
+
+void SoftmaxLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
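
Note: configure_output() pins the output quantization to (scale = 1/256, offset = 0); softmax produces values in [0, 1), so 256 uniform steps of 1/256 cover the representable range of a QASYMM8 output. A minimal sketch of driving the node directly, assuming the input edge and output tensor are bound elsewhere through the owning Graph:

    SoftmaxLayerNode node(1.f);        // beta = 1 -> standard softmax
    // ... bind input edge 0 and output tensor 0 via the Graph API ...
    if(node.forward_descriptors())
    {
        // The output descriptor now mirrors the input shape, with
        // quant_info forced to QuantizationInfo(1.f / 256.f, 0).
    }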
diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp
new file mode 100644
index 0000000..5d46c9d
--- /dev/null
+++ b/src/graph/nodes/SplitLayerNode.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SplitLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+SplitLayerNode::SplitLayerNode(unsigned int num_splits, unsigned int axis)
+ : _num_splits(num_splits), _axis(axis)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(num_splits, NullTensorID);
+}
+
+unsigned int SplitLayerNode::num_splits() const
+{
+ return _num_splits;
+}
+
+unsigned int SplitLayerNode::axis() const
+{
+ return _axis;
+}
+
+std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ unsigned int num_splits, unsigned int axis, unsigned int idx)
+{
+ const unsigned int split_size = input_descriptor.shape[axis] / num_splits;
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(axis, split_size);
+
+ Coordinates coords;
+ coords.set(axis, idx * split_size);
+
+ return std::make_pair(output_descriptor, coords);
+}
+
+bool SplitLayerNode::forward_descriptors()
+{
+ if(input_id(0) != NullTensorID)
+ {
+ validate();
+ for(unsigned int i = 0; i < _outputs.size(); ++i)
+ {
+ if(output_id(i) != NullTensorID)
+ {
+ Tensor *dst_i = output(i);
+ ARM_COMPUTE_ERROR_ON(dst_i == nullptr);
+ dst_i->desc() = configure_output(i);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor SplitLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ TensorDescriptor output_info;
+ std::tie(output_info, std::ignore) = compute_output_descriptor(src->desc(), _num_splits, _axis, idx);
+
+ return output_info;
+}
+
+Status SplitLayerNode::validate() const
+{
+ const Tensor *src = input(0);
+ ARM_COMPUTE_RETURN_ERROR_ON(src == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(_axis >= src->desc().shape.num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[_axis] % _num_splits, "Split should be exact");
+
+ return Status{};
+}
+
+NodeType SplitLayerNode::type() const
+{
+ return NodeType::SplitLayer;
+}
+
+void SplitLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
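
compute_output_descriptor() slices the requested axis into equal parts and returns both the per-output shape and the starting coordinates of slice idx; validate() rejects axes that the split count does not divide exactly. A worked example with an illustrative input shape:

    // Given input_desc with shape (8, 8, 64), split axis 2 into 4 slices
    // and query slice idx = 1:
    TensorDescriptor out_desc;
    Coordinates      start;
    std::tie(out_desc, start) = SplitLayerNode::compute_output_descriptor(input_desc, 4, 2, 1);
    // out_desc.shape -> (8, 8, 16)   (64 / 4 = 16 per slice)
    // start          -> axis 2 offset = 1 * 16 = 16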
diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
deleted file mode 100644
index fe56122..0000000
--- a/src/graph/operations/CLSimpleOperations.cpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/graph/IOperation.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistrar.h"
-#include "arm_compute/graph/Types.h"
-#include "arm_compute/runtime/CL/CLFunctions.h"
-#include "support/ToolchainSupport.h"
-#include "utils/GraphTypePrinter.h"
-#include "utils/TypePrinter.h"
-
-#include <memory>
-
-using namespace arm_compute::graph;
-
-/* Activation Layer */
-REGISTER_SIMPLE_OPERATION(CLActivationLayerOperation, OPENCL, OperationType::ActivationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
-
- // Create and configure function
- auto activation = arm_compute::support::cpp14::make_unique<arm_compute::CLActivationLayer>();
- activation->configure(in, out, act_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLActivationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Activation function: " << act_info.activation()
- << " a: " << act_info.a()
- << " b: " << act_info.b()
- << std::endl);
-
- return std::move(activation);
-}
-
-/* Arithmetic addition */
-REGISTER_SIMPLE_OPERATION(CLArithmeticAdditionOperation, OPENCL, OperationType::ArithmeticAddition)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in1 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *in2 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- auto addition = arm_compute::support::cpp14::make_unique<arm_compute::CLArithmeticAddition>();
- addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLArithmeticAddition"
- << " Data Type: " << in1->info()->data_type()
- << " Input 1 shape: " << in1->info()->tensor_shape()
- << " Input 2 shape: " << in2->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(addition);
-}
-
-/* Batch Normalization Layer */
-REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationType::BatchNormalizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 5);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *mean = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
- auto *var = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
- auto *beta = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3));
- auto *gamma = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto epsilon = ctx.parameter<float>("epsilon");
- const auto act_info = ctx.parameter<ActivationLayerInfo>("act_info");
-
- // Create and configure function
- auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLBatchNormalizationLayer>();
- batch_norm->configure(in, out, mean, var, beta, gamma, epsilon, act_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLBatchNormalizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Mean shape: " << mean->info()->tensor_shape()
- << " Var shape: " << var->info()->tensor_shape()
- << " Beta shape: " << beta->info()->tensor_shape()
- << " Gamma shape: " << gamma->info()->tensor_shape()
- << " Epsilon: " << epsilon
- << " Activation function: " << act_info.activation()
- << " a: " << act_info.a()
- << " b: " << act_info.b()
- << std::endl);
-
- return std::move(batch_norm);
-}
-
-/* DepthConvertLayer Layer */
-REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::DepthConvertLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto conv_policy = ctx.parameter<ConvertPolicy>("ConvertPolicy");
- const auto shift = ctx.parameter<uint32_t>("shift");
-
- // Create and configure function
- auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthConvertLayer>();
- depthconvert->configure(in, out, conv_policy, shift);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDepthConvertLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " shift: " << shift
- << std::endl);
-
- return std::move(depthconvert);
-}
-
-/* DepthwiseConvolutionLayer Layer */
-REGISTER_SIMPLE_OPERATION(CLDepthwiseConvolutionOperation, OPENCL, OperationType::DepthwiseConvolutionLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2 && ctx.num_inputs() != 3);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *weights = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
- auto *biases = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) : nullptr;
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
- const auto opt3x3 = ctx.parameter<bool>("Optimized3x3");
-
- // Create and configure function
- std::unique_ptr<arm_compute::IFunction> func;
- bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
- if(run_3x3_opt)
- {
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
- depwthwise_conv->configure(in, weights, biases, out, conv_info);
- func = std::move(depwthwise_conv);
- }
- else
- {
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
- depwthwise_conv->configure(in, weights, biases, out, conv_info);
- func = std::move(depwthwise_conv);
- }
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDepthwiseConvolutionLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape());
- if(biases == nullptr)
- {
- ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: No biases provided" << std::endl);
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: " << biases->info()->tensor_shape() << std::endl);
- }
-
- return func;
-}
-
-/* DeQuantizationLayer Layer */
-REGISTER_SIMPLE_OPERATION(CLDequantizationLayerOperation, OPENCL, OperationType::DequantizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 2);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(1)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- auto *min_max = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(1));
-
- // Create and configure function
- auto dequantization = arm_compute::support::cpp14::make_unique<arm_compute::CLDequantizationLayer>();
- dequantization->configure(in, out, min_max);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDequantizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Min max shape: " << min_max->info()->tensor_shape()
- << std::endl);
-
- return std::move(dequantization);
-}
-
-/* Flatten Layer */
-REGISTER_SIMPLE_OPERATION(CLFlattenLayerOperation, OPENCL, OperationType::FlattenLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- // Create and configure function
- auto flatten = arm_compute::support::cpp14::make_unique<arm_compute::CLFlattenLayer>();
- flatten->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFlattenLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(flatten);
-}
-
-/* Floor Layer */
-REGISTER_SIMPLE_OPERATION(CLFloorLayerOperation, OPENCL, OperationType::FloorLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- // Create and configure function
- auto floor = arm_compute::support::cpp14::make_unique<arm_compute::CLFloor>();
- floor->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFloorLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(floor);
-}
-
-/* Fully Connected Layer */
-REGISTER_SIMPLE_OPERATION(CLFullyConnectedLayer, OPENCL, OperationType::FullyConnectedLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 3);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *weights = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
- auto *biases = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- // Create and configure function
- auto fc = arm_compute::support::cpp14::make_unique<arm_compute::CLFullyConnectedLayer>();
- fc->configure(in, weights, biases, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFullyConnectedLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Biases Shape: " << biases->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(fc);
-}
-
-/* L2 Normalize Layer */
-REGISTER_SIMPLE_OPERATION(CLL2NormalizeLayerOperation, OPENCL, OperationType::L2NormalizeLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto axis = ctx.parameter<unsigned int>("axis");
- const auto epsilon = ctx.parameter<float>("epsilon");
-
- // Create and configure function
- auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLL2NormalizeLayer>();
- l2_norm->configure(in, out, axis, epsilon);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLL2NormalizeLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Axis: " << axis
- << " Epsilon: " << epsilon
- << std::endl);
-
- return std::move(l2_norm);
-}
-
-/* Normalization Layer */
-REGISTER_SIMPLE_OPERATION(CLNormalizationLayerOperation, OPENCL, OperationType::NormalizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto norm_info = ctx.parameter<NormalizationLayerInfo>("NormalizationLayerInfo");
-
- // Create and configure function
- auto norm = arm_compute::support::cpp14::make_unique<arm_compute::CLNormalizationLayer>();
- norm->configure(in, out, norm_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLNormalizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Normalization info: " << norm_info
- << std::endl);
-
- return std::move(norm);
-}
-
-/* Pooling Layer */
-REGISTER_SIMPLE_OPERATION(CLPoolingLayerOperation, OPENCL, OperationType::PoolingLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
- const auto pool_info = ctx.parameter<PoolingLayerInfo>("PoolingLayerInfo");
-
- // Create and configure function
- auto pool = arm_compute::support::cpp14::make_unique<arm_compute::CLPoolingLayer>();
- pool->configure(in, out, pool_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLPoolingLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Pooling info: " << pool_info
- << std::endl);
-
- return std::move(pool);
-}
-
-/* Quantization Layer */
-REGISTER_SIMPLE_OPERATION(CLQuantizationLayerOperation, OPENCL, OperationType::QuantizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- // Create and configure function
- auto quantization = arm_compute::support::cpp14::make_unique<arm_compute::CLQuantizationLayer>();
- quantization->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLQuantizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(quantization);
-}
-
-/* Reshape Layer */
-REGISTER_SIMPLE_OPERATION(CLReshapeLayerOperation, OPENCL, OperationType::ReshapeLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- // Create and configure function
- auto reshape = arm_compute::support::cpp14::make_unique<arm_compute::CLReshapeLayer>();
- reshape->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLReshapeLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(reshape);
-}
-
-/* Softmax Layer */
-REGISTER_SIMPLE_OPERATION(CLSoftmaxLayerOperation, OPENCL, OperationType::SoftmaxLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
- // Create and configure function
- auto smx = arm_compute::support::cpp14::make_unique<arm_compute::CLSoftmaxLayer>();
- smx->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLSoftmaxLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(smx);
-}
diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
deleted file mode 100644
index 4154b9a..0000000
--- a/src/graph/operations/NESimpleOperations.cpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/graph/IOperation.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistrar.h"
-#include "arm_compute/graph/Types.h"
-#include "arm_compute/runtime/NEON/NEFunctions.h"
-#include "support/ToolchainSupport.h"
-#include "utils/GraphTypePrinter.h"
-#include "utils/TypePrinter.h"
-
-#include <memory>
-
-using namespace arm_compute::graph;
-
-/* Activation Layer */
-REGISTER_SIMPLE_OPERATION(NEActivationLayerOperation, NEON, OperationType::ActivationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
-
- // Create and configure function
- auto activation = arm_compute::support::cpp14::make_unique<arm_compute::NEActivationLayer>();
- activation->configure(in, out, act_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEActivationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Activation function: " << act_info.activation()
- << " a: " << act_info.a()
- << " b: " << act_info.b()
- << std::endl);
-
- return std::move(activation);
-}
-
-/* Arithmetic addition */
-REGISTER_SIMPLE_OPERATION(NEArithmeticAdditionOperation, NEON, OperationType::ArithmeticAddition)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in1 = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *in2 = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- auto addition = arm_compute::support::cpp14::make_unique<arm_compute::NEArithmeticAddition>();
- addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEArithmeticAddition"
- << " Data Type: " << in1->info()->data_type()
- << " Input 1 shape: " << in1->info()->tensor_shape()
- << " Input 2 shape: " << in2->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(addition);
-}
-
-/* Batch Normalization Layer */
-REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationType::BatchNormalizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 5);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(3)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(4)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *mean = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
- auto *var = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
- auto *beta = dynamic_cast<arm_compute::ITensor *>(ctx.input(3));
- auto *gamma = dynamic_cast<arm_compute::ITensor *>(ctx.input(4));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto epsilon = ctx.parameter<float>("epsilon");
- const auto act_info = ctx.parameter<ActivationLayerInfo>("act_info");
-
- // Create and configure function
- auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEBatchNormalizationLayer>();
- batch_norm->configure(in, out, mean, var, beta, gamma, epsilon, act_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEBatchNormalizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Mean shape: " << mean->info()->tensor_shape()
- << " Var shape: " << var->info()->tensor_shape()
- << " Beta shape: " << beta->info()->tensor_shape()
- << " Gamma shape: " << gamma->info()->tensor_shape()
- << " Epsilon: " << epsilon
- << " Activation function: " << act_info.activation()
- << " a: " << act_info.a()
- << " b: " << act_info.b()
- << std::endl);
-
- return std::move(batch_norm);
-}
-
-/* DepthConvertLayer Layer */
-REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::DepthConvertLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto conv_policy = ctx.parameter<ConvertPolicy>("ConvertPolicy");
- const auto shift = ctx.parameter<uint32_t>("shift");
-
- // Create and configure function
- auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthConvertLayer>();
- depthconvert->configure(in, out, conv_policy, shift);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthConvertLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " shift: " << shift
- << std::endl);
-
- return std::move(depthconvert);
-}
-
-/* DepthwiseConvolutionLayer Layer */
-REGISTER_SIMPLE_OPERATION(NEDepthwiseConvolutionOperation, NEON, OperationType::DepthwiseConvolutionLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2 && ctx.num_inputs() != 3);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *weights = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
- auto *biases = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) : nullptr;
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
- const auto opt3x3 = ctx.parameter<bool>("Optimized3x3");
-
- // Create and configure function
- std::unique_ptr<arm_compute::IFunction> func;
- bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
- if(run_3x3_opt)
- {
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
- depwthwise_conv->configure(in, weights, biases, out, conv_info);
- func = std::move(depwthwise_conv);
- }
- else
- {
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
- depwthwise_conv->configure(in, weights, biases, out, conv_info);
- func = std::move(depwthwise_conv);
- }
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthwiseConvolutionLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape());
- if(biases == nullptr)
- {
- ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: No biases provided" << std::endl);
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: " << biases->info()->tensor_shape() << std::endl);
- }
-
- return func;
-}
-
-/* DeQuantizationLayer Layer */
-REGISTER_SIMPLE_OPERATION(NEDequantizationLayerOperation, NEON, OperationType::DequantizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 2);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(1)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- auto *min_max = dynamic_cast<arm_compute::ITensor *>(ctx.output(1));
-
- // Create and configure function
- auto dequantization = arm_compute::support::cpp14::make_unique<arm_compute::NEDequantizationLayer>();
- dequantization->configure(in, out, min_max);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDequantizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Min max shape: " << min_max->info()->tensor_shape()
- << std::endl);
-
- return std::move(dequantization);
-}
-
-/* Flatten Layer */
-REGISTER_SIMPLE_OPERATION(NEFlattenLayerOperation, NEON, OperationType::FlattenLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- // Create and configure function
- auto flatten = arm_compute::support::cpp14::make_unique<arm_compute::NEFlattenLayer>();
- flatten->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFlattenLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(flatten);
-}
-
-/* Floor Layer */
-REGISTER_SIMPLE_OPERATION(NEFloorLayerOperation, NEON, OperationType::FloorLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- // Create and configure function
- auto floor = arm_compute::support::cpp14::make_unique<arm_compute::NEFloor>();
- floor->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFloorLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(floor);
-}
-
-/* Fully Connected Layer */
-REGISTER_SIMPLE_OPERATION(NEFullyConnectedLayer, NEON, OperationType::FullyConnectedLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 3);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *weights = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
- auto *biases = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- // Create and configure function
- auto fc = arm_compute::support::cpp14::make_unique<arm_compute::NEFullyConnectedLayer>();
- fc->configure(in, weights, biases, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFullyConnectedLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Biases Shape: " << biases->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(fc);
-}
-
-/* L2 Normalize Layer */
-REGISTER_SIMPLE_OPERATION(NEL2NormalizeLayerOperation, NEON, OperationType::L2NormalizeLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto axis = ctx.parameter<unsigned int>("axis");
- const auto epsilon = ctx.parameter<float>("epsilon");
-
- // Create and configure function
- auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEL2NormalizeLayer>();
- l2_norm->configure(in, out, axis, epsilon);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEL2NormalizeLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Axis: " << axis
- << " Epsilon: " << epsilon
- << std::endl);
-
- return std::move(l2_norm);
-}
-
-/* Normalization Layer */
-REGISTER_SIMPLE_OPERATION(NENormalizationLayerOperation, NEON, OperationType::NormalizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto norm_info = ctx.parameter<NormalizationLayerInfo>("NormalizationLayerInfo");
-
- // Create and configure function
- auto norm = arm_compute::support::cpp14::make_unique<arm_compute::NENormalizationLayer>();
- norm->configure(in, out, norm_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NENormalizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Normalization info: " << norm_info
- << std::endl);
-
- return std::move(norm);
-}
-
-/* Pooling Layer */
-REGISTER_SIMPLE_OPERATION(NEPoolingLayerOperation, NEON, OperationType::PoolingLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
- const auto pool_info = ctx.parameter<PoolingLayerInfo>("PoolingLayerInfo");
-
- // Create and configure function
- auto pool = arm_compute::support::cpp14::make_unique<arm_compute::NEPoolingLayer>();
- pool->configure(in, out, pool_info);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEPoolingLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << " Pooling info: " << pool_info
- << std::endl);
-
- return std::move(pool);
-}
-
-/* Quantization Layer */
-REGISTER_SIMPLE_OPERATION(NEQuantizationLayerOperation, NEON, OperationType::QuantizationLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- // Create and configure function
- auto quantization = arm_compute::support::cpp14::make_unique<arm_compute::NEQuantizationLayer>();
- quantization->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEQuantizationLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(quantization);
-}
-
-/* Reshape Layer */
-REGISTER_SIMPLE_OPERATION(NEReshapeLayerOperation, NEON, OperationType::ReshapeLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- // Create and configure function
- auto reshape = arm_compute::support::cpp14::make_unique<arm_compute::NEReshapeLayer>();
- reshape->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEReshapeLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(reshape);
-}
-
-/* Softmax Layer */
-REGISTER_SIMPLE_OPERATION(NESoftmaxLayerOperation, NEON, OperationType::SoftmaxLayer)
-{
- ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
- ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
- ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
- // Extract IO and info
- auto *in = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
- auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
- // Create and configure function
- auto smx = arm_compute::support::cpp14::make_unique<arm_compute::NESoftmaxLayer>();
- smx->configure(in, out);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NESoftmaxLayer"
- << " Data Type: " << in->info()->data_type()
- << " Input shape: " << in->info()->tensor_shape()
- << " Output shape: " << out->info()->tensor_shape()
- << std::endl);
-
- return std::move(smx);
-}
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
new file mode 100644
index 0000000..61cf423
--- /dev/null
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/printers/DotGraphPrinter.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+void DotGraphVisitor::visit(ActivationLayerNode &n)
+{
+ std::stringstream ss;
+ ss << n.activation_info().activation();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(BatchNormalizationLayerNode &n)
+{
+ std::stringstream ss;
+ ss << (n.fused_activation().enabled() ? to_string(n.fused_activation().activation()) : "");
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(ConvolutionLayerNode &n)
+{
+ std::stringstream ss;
+ ss << n.convolution_method();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(DepthConcatenateLayerNode &n)
+{
+ std::stringstream ss;
+ ss << "Enabled: " << n.is_enabled();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(DepthwiseConvolutionLayerNode &n)
+{
+ std::stringstream ss;
+ ss << n.depthwise_convolution_method();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(EltwiseLayerNode &n)
+{
+ std::stringstream ss;
+ ss << n.eltwise_operation();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(NormalizationLayerNode &n)
+{
+ std::stringstream ss;
+ ss << n.normalization_info().type();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::visit(PoolingLayerNode &n)
+{
+ std::stringstream ss;
+ ss << n.pooling_info().pool_type();
+ ss << R"( \n )";
+ ss << n.pooling_info().pool_size();
+ ss << R"( \n )";
+ ss << n.pooling_info().pad_stride_info();
+ _info = ss.str();
+}
+
+void DotGraphVisitor::default_visit()
+{
+ _info.clear();
+}
+
+const std::string &DotGraphVisitor::info() const
+{
+ return _info;
+}
+
+void DotGraphPrinter::print(const Graph &g, std::ostream &os)
+{
+ // Print header
+ print_header(g, os);
+
+ // Print nodes
+ print_nodes(g, os);
+
+ // Print edges
+ print_edges(g, os);
+
+ // Print footer
+ print_footer(g, os);
+}
+
+void DotGraphPrinter::print_header(const Graph &g, std::ostream &os)
+{
+ // Print graph name
+ std::string graph_name = (g.name().empty()) ? "Graph" : g.name();
+ os << "digraph " << graph_name << "{\n";
+}
+
+void DotGraphPrinter::print_footer(const Graph &g, std::ostream &os)
+{
+ ARM_COMPUTE_UNUSED(g);
+ os << "}\n";
+}
+
+void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os)
+{
+ for(const auto &n : g.nodes())
+ {
+ if(n)
+ {
+ // Output node id
+ std::string node_id = std::string("n") + support::cpp11::to_string(n->id());
+ os << node_id << " ";
+
+ // Output label
+ n->accept(_dot_node_visitor);
+
+ std::string name = n->name().empty() ? node_id : n->name();
+ auto node_description = _dot_node_visitor.info();
+
+ os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description << R"("])";
+ os << ";\n";
+ }
+ }
+}
+
+void DotGraphPrinter::print_edges(const Graph &g, std::ostream &os)
+{
+ for(const auto &e : g.edges())
+ {
+ if(e)
+ {
+ std::string source_node_id = std::string("n") + support::cpp11::to_string(e->producer_id());
+ std::string sink_node_id = std::string("n") + support::cpp11::to_string(e->consumer_id());
+ os << source_node_id << " -> " << sink_node_id << " ";
+ const Tensor *t = e->tensor();
+ ARM_COMPUTE_ERROR_ON(t == nullptr);
+ os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" << t->desc().layout << R"("])";
+ os << ";\n";
+ }
+ }
+}
+} // namespace graph
+} // namespace arm_compute
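
For reference, the printer emits plain Graphviz DOT: one n<id> statement per node, labelled with its name, assigned target and visitor info, and one edge statement per connection, labelled with the tensor's shape, data type and layout. A two-node graph would come out roughly as follows (names, targets and shapes are illustrative):

    digraph ExampleGraph{
    n0 [label = "conv1 \n CL \n Direct"];
    n1 [label = "softmax \n CL \n "];
    n0 -> n1 [label = "8x8x64 \n F32 \n NCHW"];
    }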
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 50b0f0e..7f0e374 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/Allocator.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -39,3 +41,9 @@
{
::operator delete(ptr);
}
+
+std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+}
\ No newline at end of file
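
make_region() lets memory managers obtain backing storage through the allocator interface instead of the raw allocate()/free() pair. A minimal usage sketch (the size is arbitrary; the alignment hint is currently ignored, as the implementation shows):

    Allocator allocator;
    std::unique_ptr<IMemoryRegion> region = allocator.make_region(1024, 0);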
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 3ca5071..2a4ab6e 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,15 +57,15 @@
ARM_COMPUTE_ERROR_ON(!are_all_finalized());
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
- // Sort active group requirements in descending order.
- std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & a, const Element & b)
+ // Sort free blobs by max size in descending order.
+ _free_blobs.sort([](const Blob & ba, const Blob & bb)
{
- return a.size > b.size;
+ return ba.max_size > bb.max_size;
});
std::vector<size_t> group_sizes;
- std::transform(std::begin(_active_elements), std::end(_active_elements), std::back_inserter(group_sizes), [](const Element & e)
+ std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
{
- return e.size;
+ return b.max_size;
});
// Update blob sizes
@@ -80,8 +80,14 @@
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
int blob_idx = 0;
- for(auto &e : _active_elements)
+ for(auto &free_blob : _free_blobs)
{
- group_mappings[e.handle] = blob_idx++;
+ for(auto &bound_element_id : free_blob.bound_elements)
+ {
+ ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
+ Element &bound_element = _active_elements[bound_element_id];
+ group_mappings[bound_element.handle] = blob_idx;
+ }
+ ++blob_idx;
}
}
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index 9a5c13a..84789e7 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -47,3 +49,9 @@
ARM_COMPUTE_ERROR_ON(ptr == nullptr);
clReleaseMemObject(static_cast<cl_mem>(ptr));
}
+
+std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+}
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
index 3f5266c..c4ea639 100644
--- a/src/runtime/CL/CLHOG.cpp
+++ b/src/runtime/CL/CLHOG.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,11 @@
uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
}
void CLHOG::do_unmap(cl::CommandQueue &q)
{
ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
q.enqueueUnmapMemObject(_buffer, descriptor());
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
new file mode 100644
index 0000000..534c4f9
--- /dev/null
+++ b/src/runtime/CL/CLMemory.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemory.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+CLMemory::CLMemory()
+ : _region(nullptr), _region_owned(nullptr)
+{
+ create_empty_region();
+}
+
+CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
+ : _region(nullptr), _region_owned(std::move(memory))
+{
+ if(_region_owned == nullptr)
+ {
+ create_empty_region();
+ }
+ _region = _region_owned.get();
+}
+
+CLMemory::CLMemory(ICLMemoryRegion *memory)
+ : _region(memory), _region_owned(nullptr)
+{
+}
+
+ICLMemoryRegion *CLMemory::region()
+{
+ return _region;
+}
+
+ICLMemoryRegion *CLMemory::region() const
+{
+ return _region;
+}
+
+void CLMemory::create_empty_region()
+{
+ _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+ _region = _region_owned.get();
+}
+} // namespace arm_compute
\ No newline at end of file
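The three ownership modes of the new class, as a sketch (illustrative, not part of the patch; a valid default OpenCL context is assumed):

using namespace arm_compute;

cl::Context ctx = cl::Context::getDefault();
auto region = std::make_shared<CLBufferMemoryRegion>(ctx, CL_MEM_READ_WRITE, 1024);

CLMemory empty;                  // default: creates and owns a zero-sized region
CLMemory shared_mem(region);     // shares ownership of an existing region
CLMemory borrowed(region.get()); // non-owning view over the same region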
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
new file mode 100644
index 0000000..15fd7f3
--- /dev/null
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+ICLMemoryRegion::ICLMemoryRegion(cl::Context ctx, size_t size)
+ : IMemoryRegion(size), _ctx(std::move(ctx)), _mapping(nullptr), _mem()
+{
+}
+
+const cl::Buffer &ICLMemoryRegion::cl_data() const
+{
+ return _mem;
+}
+
+void *ICLMemoryRegion::buffer()
+{
+ return _mapping;
+}
+
+void *ICLMemoryRegion::buffer() const
+{
+ return _mapping;
+}
+
+void **ICLMemoryRegion::handle()
+{
+ return reinterpret_cast<void **>(&_mem);
+}
+
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
+ : ICLMemoryRegion(std::move(ctx), size)
+{
+ if(_size != 0)
+ {
+ _mem = cl::Buffer(_ctx, flags, _size);
+ }
+}
+
+void *CLBufferMemoryRegion::ptr()
+{
+ return nullptr;
+}
+
+void *CLBufferMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ _mapping = q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, _size);
+ return _mapping;
+}
+
+void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ q.enqueueUnmapMemObject(_mem, _mapping);
+ _mapping = nullptr;
+}
+
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+ : ICLMemoryRegion(std::move(ctx), size), _ptr(nullptr)
+{
+ if(size != 0)
+ {
+ _ptr = clSVMAlloc(_ctx.get(), flags, size, alignment);
+ if(_ptr != nullptr)
+ {
+ _mem = cl::Buffer(_ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
+ }
+ }
+}
+
+ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
+{
+ if(_ptr != nullptr)
+ {
+ clFinish(CLScheduler::get().queue().get());
+ _mem = cl::Buffer();
+ clSVMFree(_ctx.get(), _ptr);
+ }
+}
+
+void *ICLSVMMemoryRegion::ptr()
+{
+ return _ptr;
+}
+
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+ : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+ clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+ _mapping = _ptr;
+ return _mapping;
+}
+
+void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+ clEnqueueSVMUnmap(q.get(), _ptr, 0, nullptr, nullptr);
+ _mapping = nullptr;
+}
+
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+ : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+ if(blocking)
+ {
+ clFinish(q.get());
+ }
+ _mapping = _ptr;
+ return _mapping;
+}
+
+void CLFineSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_UNUSED(q);
+ _mapping = nullptr;
+}
+} // namespace arm_compute
\ No newline at end of file
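A sketch of the map/unmap contract shared by the region types above (illustrative, not part of the patch; an initialized CLScheduler is assumed):

#include <cstring>

void fill_with_zeros(arm_compute::CLBufferMemoryRegion &region, size_t size)
{
    cl::CommandQueue &q = arm_compute::CLScheduler::get().queue();
    auto *ptr = static_cast<uint8_t *>(region.map(q, true)); // blocking map
    std::memset(ptr, 0, size);
    region.unmap(q); // the mapping pointer is invalid from here on
}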
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 65292fe..fdae615 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -31,7 +31,7 @@
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
+ : _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
}
@@ -52,7 +52,7 @@
if(_cl_tuner != nullptr)
{
// Tune the OpenCL kernel
- _cl_tuner->tune_kernel(kernel);
+ _cl_tuner->tune_kernel_dynamic(kernel);
}
// Run kernel
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 5f58024..d0e7d76 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -29,6 +29,11 @@
using namespace arm_compute;
+CLSubTensor::CLSubTensor()
+ : _parent(nullptr), _info()
+{
+}
+
CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
: _parent(nullptr), _info()
{
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index bc513d1..dd27738 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@
return _allocator.cl_data();
}
-ITensorAllocator *CLTensor::allocator()
+CLTensorAllocator *CLTensor::allocator()
{
return &_allocator;
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index ad165fa..54e7c5b 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,36 +30,57 @@
using namespace arm_compute;
-CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
- : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
+namespace
{
-}
+std::shared_ptr<arm_compute::ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+{
+ // Try fine-grain SVM
+ std::shared_ptr<ICLMemoryRegion> region = std::make_shared<CLFineSVMMemoryRegion>(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
-CLTensorAllocator::~CLTensorAllocator()
+ // Try coarse-grain SVM in case of failure
+ if(region != nullptr && region->ptr() == nullptr)
+ {
+ region = std::make_shared<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+ }
+ // Try legacy buffer memory in case of failure
+ if(region != nullptr && region->ptr() == nullptr)
+ {
+ region = std::make_shared<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+ }
+ return region;
+}
+} // namespace
+
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+ : _associated_memory_group(nullptr), _memory(), _owner(owner)
{
- _buffer = cl::Buffer();
}
uint8_t *CLTensorAllocator::data()
{
- return _mapping;
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
const cl::Buffer &CLTensorAllocator::cl_data() const
{
- return _buffer;
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return _memory.region()->cl_data();
}
void CLTensorAllocator::allocate()
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+
if(_associated_memory_group == nullptr)
{
- _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+ _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
}
else
{
- _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory.region()->handle(), info().total_size());
+ _memory.region()->set_size(info().total_size());
}
info().set_is_resizable(false);
}
@@ -68,41 +89,55 @@
{
if(_associated_memory_group == nullptr)
{
- _buffer = cl::Buffer();
+ _memory = CLMemory();
info().set_is_resizable(true);
}
}
+arm_compute::Status CLTensorAllocator::import_memory(CLMemory memory)
+{
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->cl_data().get() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+ _memory = memory;
+ info().set_is_resizable(false);
+
+ return Status{};
+}
+
void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
{
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+ _memory = CLMemory(std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0));
_associated_memory_group = associated_memory_group;
}
uint8_t *CLTensorAllocator::lock()
{
- ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
- _mapping = map(CLScheduler::get().queue(), true);
- return _mapping;
+ return map(CLScheduler::get().queue(), true);
}
void CLTensorAllocator::unlock()
{
- ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
- unmap(CLScheduler::get().queue(), _mapping);
- _mapping = nullptr;
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ unmap(CLScheduler::get().queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
}
uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size()));
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+ _memory.region()->map(q, blocking);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
- q.enqueueUnmapMemObject(_buffer, mapping);
+ ARM_COMPUTE_UNUSED(mapping);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() == nullptr);
+ _memory.region()->unmap(q);
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index df8e255..5f82cd3 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -35,61 +35,6 @@
using namespace arm_compute;
-namespace
-{
-/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
-class Interceptor
-{
-public:
- explicit Interceptor(CLTuner &tuner);
-
- /** clEnqueueNDRangeKernel interface
- *
- * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
- * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
- * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
- * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
- * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
- * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
- * @param[in] num_events_in_wait_list Number of events in the waiting list
- * @param[in] event_wait_list Event waiting list
- * @param[in] event OpenCL kernel event
- *
- * @return the OpenCL status
- */
- cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event);
-
-private:
- CLTuner &_tuner;
-};
-
-Interceptor::Interceptor(CLTuner &tuner)
- : _tuner(tuner)
-{
-}
-
-cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event)
-{
- ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
- ARM_COMPUTE_UNUSED(event);
- if(_tuner.kernel_event_is_set())
- {
- // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
- return CL_SUCCESS;
- }
- cl_event tmp;
- cl_int retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
-
- // Set OpenCL event
- _tuner.set_cl_kernel_event(tmp);
-
- return retval;
-}
-
-} // namespace
-
CLTuner::CLTuner(bool tune_new_kernels)
: real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
{
@@ -113,7 +58,12 @@
return _tune_new_kernels;
}
-void CLTuner::tune_kernel(ICLKernel &kernel)
+void CLTuner::tune_kernel_static(ICLKernel &kernel)
+{
+ ARM_COMPUTE_UNUSED(kernel);
+}
+
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
{
// Get the configuration ID from the kernel
const std::string &config_id = kernel.config_id();
@@ -173,7 +123,25 @@
}
}
// Start intercepting enqueues:
- CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
+ auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list, cl_event * event)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+ ARM_COMPUTE_UNUSED(event);
+ if(this->kernel_event_is_set())
+ {
+ // If the event is already set, the kernel enqueue is sliced; only the first slice is timed, so skip the remaining enqueues.
+ return CL_SUCCESS;
+ }
+ cl_event tmp;
+ cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+ // Set OpenCL event
+ this->set_cl_kernel_event(tmp);
+
+ return retval;
+ };
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
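Tuning is engaged the same way as before the lambda rewrite; a minimal sketch (illustrative, not part of the patch; assumes CLScheduler::default_init() accepting a tuner pointer, as in this release):

arm_compute::CLTuner tuner(true /* tune kernels not seen before */);
arm_compute::CLScheduler::get().default_init(&tuner);
// Every CLScheduler::enqueue() now goes through tune_kernel_dynamic()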
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
new file mode 100644
index 0000000..ff50073
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>();
+ k->configure(input, output, num_groups);
+ _kernel = std::move(k);
+}
+
+Status CLChannelShuffleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ return CLChannelShuffleLayerKernel::validate(input, output, num_groups);
+}
+} // namespace arm_compute
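A usage sketch for the new function (illustrative, not part of the patch; src and dst are allocated CLTensors whose channel count is divisible by num_groups):

arm_compute::CLChannelShuffleLayer shuffle;
shuffle.configure(&src, &dst, 4 /* num_groups */);
shuffle.run();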
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..c226e56
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>();
+ k->configure(input, output, original_input_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
\ No newline at end of file
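A usage sketch (illustrative, not part of the patch; tensor names and the 4D shape of the original fully connected input are made up):

arm_compute::CLConvertFullyConnectedWeights convert;
convert.configure(&weights_in, &weights_out,
                  arm_compute::TensorShape(7U, 7U, 512U) /* original input shape */,
                  arm_compute::DataLayout::NHWC);
convert.run();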
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 1a486ce..47a8d5f 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -42,25 +42,34 @@
{
}
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+ enable_fast_math));
- switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
- weights_info, CLScheduler::get().target()))
+ switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+ weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
{
+ case ConvolutionMethod::WINOGRAD:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ _function = std::move(f);
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
- f->configure(input, weights, biases, output, conv_info);
+ f->configure(input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
case ConvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
_function = std::move(f);
break;
}
@@ -71,25 +80,30 @@
}
Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- //Configure if the parameters match the direct convolution or the gemm-based
const GPUTarget gpu_target = CLScheduler::get().target();
- switch(CLConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, gpu_target))
+ switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
{
+ case ConvolutionMethod::WINOGRAD:
+ {
+ // Validate Winograd convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
// Validate direct convolution layer
- CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
break;
}
case ConvolutionMethod::GEMM:
{
// Validate gemm-based convolution layer
- CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
break;
}
default:
@@ -100,21 +114,34 @@
return Status{};
}
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const GPUTarget gpu_target)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
{
- ARM_COMPUTE_UNUSED(input);
- ARM_COMPUTE_UNUSED(weights);
- ARM_COMPUTE_UNUSED(biases);
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
ARM_COMPUTE_UNUSED(weights_info);
ARM_COMPUTE_UNUSED(gpu_target);
- return ConvolutionMethod::GEMM;
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ }
}
void CLConvolutionLayer::run()
{
+ prepare();
_function->run();
}
+
+void CLConvolutionLayer::prepare()
+{
+ _function->prepare();
+}
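The heuristic above tries Winograd only when dilation is 1x1 and the input has at least 16 channels, and falls back to GEMM whenever CLWinogradConvolutionLayer::validate() rejects the configuration. Querying the decision up front, as a sketch (illustrative, not part of the patch; src/weights/dst tensors and a PadStrideInfo conv_info are assumptions):

using namespace arm_compute;

ConvolutionMethod method = CLConvolutionLayer::get_convolution_method(
    src.info(), weights.info(), dst.info(), conv_info,
    WeightsInfo(), ActivationLayerInfo(), CLScheduler::get().target(),
    Size2D(1U, 1U) /* dilation */, false /* enable_fast_math */);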
diff --git a/src/graph/CL/CLMap.cpp b/src/runtime/CL/functions/CLCopy.cpp
similarity index 68%
rename from src/graph/CL/CLMap.cpp
rename to src/runtime/CL/functions/CLCopy.cpp
index 5289ea9..3442e37 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute::graph;
+#include <utility>
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+using namespace arm_compute;
+
+void CLCopy::configure(ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLMap::run()
-{
- _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+ auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
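A usage sketch for the repurposed file (illustrative, not part of the patch): CLCopy follows the usual one-kernel configure/run pattern.

arm_compute::CLCopy copy;
copy.configure(&src, &dst); // shapes and data types must match
copy.run();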
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9e6c0b4..cb8dc02 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -80,7 +80,7 @@
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, info, WeightsInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
return Status{};
}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 88e9376..676a121 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -24,6 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -35,17 +37,27 @@
using namespace arm_compute::misc::shape_calculator;
CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
- : _kernel(), _border_handler()
+ : _kernel(nullptr), _border_handler()
{
}
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- _kernel.set_target(CLScheduler::get().target());
- _kernel.configure(input, weights, biases, output, conv_info);
+ if(input->info()->data_layout() == DataLayout::NCHW)
+ {
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
+ }
+ else
+ {
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ }
+
+ _kernel->set_target(CLScheduler::get().target());
+ _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
// Configure border handler
PixelValue &&zero_value(0.f);
@@ -53,42 +65,62 @@
{
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+}
+
+Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info, GPUTarget gpu_target)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+ if(input->data_layout() == DataLayout::NCHW)
+ {
+ return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+ }
+
+ return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
}
void CLDepthwiseConvolutionLayer3x3::run()
{
CLScheduler::get().enqueue(_border_handler);
- CLScheduler::get().enqueue(_kernel);
+ CLScheduler::get().enqueue(*_kernel);
}
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
{
}
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
const size_t weights_w = weights->info()->dimension(0);
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_first_run = true;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
bool append_bias = (biases != nullptr) && !_is_quantized;
const GPUTarget gpu_target = CLScheduler::get().target();
// Calculate output shape
- TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Auto-initialize the output tensor if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
// Output width and height
- const unsigned int conv_w = dwc_output_shape.x();
- const unsigned int conv_h = dwc_output_shape.y();
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
// Set up intermediate tensors
const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
@@ -101,7 +133,7 @@
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -117,7 +149,7 @@
_v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
_v2mm_kernel.set_target(gpu_target);
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
_vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
// Output staged configuration
@@ -152,18 +184,72 @@
_v2mm_output.allocator()->allocate();
}
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != weights->dimension(2));
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool append_bias = (biases != nullptr) && !is_quantized;
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights->dimension(0);
+ const size_t weights_h = weights->dimension(1);
+ const size_t weights_z = weights->dimension(2);
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+ DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+ TensorShape shape_v2mm_out = input->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+ }
+
+ return Status{};
+}
+
void CLDepthwiseConvolutionLayer::run()
{
+ // Run weights reshaping (Runs once for every configure)
+ if(_is_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _is_first_run = false;
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
+ }
+
CLScheduler::get().enqueue(_im2col_kernel);
-
- CLScheduler::get().enqueue(_weights_reshape_kernel);
-
CLScheduler::get().enqueue(_v2mm_input_fill_border);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
CLScheduler::get().enqueue(_v2mm_kernel);
-
CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-
if(_is_quantized)
{
CLScheduler::get().enqueue(_output_stage_kernel);
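A sketch of the new depth_multiplier parameter (illustrative, not part of the patch; tensors are made up): each input channel now yields depth_multiplier output channels, so the weights must carry input_channels * depth_multiplier planes.

arm_compute::CLDepthwiseConvolutionLayer dwc;
dwc.configure(&src, &weights, &bias, &dst,
              arm_compute::PadStrideInfo(1, 1, 1, 1), 2 /* depth_multiplier */);
dwc.run(); // the first run also reshapes the weights and marks the originals unused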
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 5559d42..6f33b2e 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
+#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
@@ -33,8 +34,18 @@
{
}
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
+
+ return Status{};
+}
+
void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
_dequantize_kernel.configure(input, output, min_max);
}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index d6a335c..c451bd4 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,11 +33,11 @@
using namespace arm_compute;
CLDirectConvolutionLayer::CLDirectConvolutionLayer()
- : _direct_conv_kernel(), _input_border_handler()
+ : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
{
}
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
// Set GPU target
_direct_conv_kernel.set_target(CLScheduler::get().target());
@@ -52,11 +52,28 @@
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
_input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
+
+ _is_activationlayer_enabled = act_info.enabled();
+
+ // Configure activation layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
- return CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target()));
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ return Status{};
}
void CLDirectConvolutionLayer::run()
@@ -66,4 +83,10 @@
// Run direct convolution
CLScheduler::get().enqueue(_direct_conv_kernel);
+
+ // Run activation layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
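A sketch of the fused activation added above (illustrative, not part of the patch): an enabled ActivationLayerInfo appends an in-place activation stage after the convolution.

arm_compute::CLDirectConvolutionLayer conv;
conv.configure(&src, &weights, &bias, &dst, arm_compute::PadStrideInfo(1, 1, 0, 0),
               arm_compute::ActivationLayerInfo(
                   arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));
conv.run(); // border fill, convolution, then the activation stage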
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 2b4670b..151fa1b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -37,10 +37,8 @@
namespace
{
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
- const GPUTarget gpu_target = CLScheduler::get().target();
-
if(is_data_type_quantized_asymmetric(input.data_type()))
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -55,7 +53,7 @@
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
}
return Status{};
@@ -75,12 +73,12 @@
}
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
- _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
+ : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+ _im2col_output(), _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
{
}
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
if(_is_quantized)
{
@@ -102,8 +100,7 @@
else
{
// Configure matrix multiply kernel
- _mm_kernel.set_target(CLScheduler::get().target());
- _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
}
}
@@ -114,7 +111,7 @@
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col = compute_im2col_shape(input->info());
+ TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
@@ -122,7 +119,7 @@
_im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
- configure_mm(&_im2col_output, weights, output, false);
+ configure_mm(&_im2col_output, weights, output);
// Allocate the output tensor for im2col once all the configure methods have been called
_im2col_output.allocator()->allocate();
@@ -133,7 +130,7 @@
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- configure_mm(input, weights, output, false);
+ configure_mm(input, weights, output);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
@@ -152,6 +149,7 @@
_is_fc_after_conv = true;
_accumulate_biases = false;
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _original_weights = weights;
// Configure gemmlowp output
if(_is_quantized)
@@ -222,13 +220,6 @@
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
_gemmlowp_output.allocator()->allocate();
}
-
- // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!_are_weights_reshaped)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
}
Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -243,7 +234,7 @@
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const GPUTarget gpu_target = CLScheduler::get().target();
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input)));
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -300,7 +291,7 @@
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
}
// Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
// Validate output stage for asymmetric quantized types
if(is_quantized)
@@ -313,12 +304,7 @@
void CLFullyConnectedLayer::run()
{
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- _are_weights_reshaped = true;
- _reshape_weights_kernel.run();
- }
+ prepare();
_memory_group.acquire();
@@ -335,7 +321,7 @@
}
else
{
- CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+ _mm_gemm.run();
}
// Accumulate biases if provided
@@ -353,3 +339,30 @@
_memory_group.release();
}
+
+void CLFullyConnectedLayer::prepare()
+{
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_kernel.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare GEMM and release the reshaped weights if they are no longer needed
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ if(!_reshape_weights_output.is_used())
+ {
+ _reshape_weights_output.allocator()->free();
+ }
+ }
+
+ CLScheduler::get().queue().finish();
+ _are_weights_reshaped = true;
+ }
+}
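A sketch of the new prepare()/run() contract (illustrative, not part of the patch): prepare() performs the one-off weight reshape and releases the originals; run() invokes it implicitly on first use.

arm_compute::CLFullyConnectedLayer fc;
fc.configure(&src, &weights, &bias, &dst);
fc.prepare(); // optional warm-up: reshape weights before the first run()
fc.run();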
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 6b5cd2d..f81da6c 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -29,14 +29,18 @@
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
@@ -44,9 +48,10 @@
{
bool flag = true;
- if(gpu_target == GPUTarget::BIFROST)
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
- if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+ // COMPMID-852
+ if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
{
const float scale = k < 1024 ? 2.0f : 2.5f;
flag = (scale * n) > ((1.66f * n) + 38.4f);
@@ -56,39 +61,19 @@
flag = false;
}
}
-
- return flag;
-}
-
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- if(c != nullptr)
+ else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+ // Reshape the matrices only when this is not a vector-by-matrix case and matrix B is reshaped only on the first run
+ flag = m != 1 && reshape_b_only_on_first_run;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(beta);
- return Status{};
+ return flag;
}
} // namespace
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
- _is_first_run(true), _reshape_b_only_on_first_run(false)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+ _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
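
Worked through, the updated heuristic above selects reshaping on the listed G7x-class parts only for sufficiently wide float GEMMs: with 256 < k < 1024 the scale is 2.0, so 2.0*n > 1.66*n + 38.4 reduces to n > ~113, while for k >= 1024 the scale 2.5 lowers the threshold to n > ~46; every other target reshapes whenever m != 1 and B is reshaped only on the first run. The same decision restated in isolation, with assumed parameter names standing in for the checks above:

    // Restatement of is_interleaved_transposed() above; 'is_g7x_class' stands for
    // the gpu_target_is_in(...) check and 'is_float' for is_data_type_float(data_type).
    bool use_reshaped_gemm(int m, int n, int k, bool is_float, bool reshape_b_only_on_first_run, bool is_g7x_class)
    {
        if(is_g7x_class)
        {
            if(k > 256 && m > 4 && is_float && reshape_b_only_on_first_run)
            {
                const float scale = (k < 1024) ? 2.0f : 2.5f;
                return (scale * n) > ((1.66f * n) + 38.4f); // n > ~113 (k < 1024), n > ~46 otherwise
            }
            return false;
        }
        return (m != 1) && reshape_b_only_on_first_run; // skip reshaping for vector-by-matrix
    }
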
@@ -97,10 +82,14 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+ // Store original b matrix
+ _original_b = b;
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = false;
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -121,7 +110,7 @@
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
- if(gpu_target == GPUTarget::BIFROST)
+ if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
@@ -137,8 +126,10 @@
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
@@ -154,7 +145,10 @@
{
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
// Configure matrix addition kernel
@@ -165,14 +159,74 @@
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_UNUSED(alpha);
+
+ // Check if we need to reshape the matrix B only on the first run
+ const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo tmp_output_info = *output->clone();
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // so that the kernel knows how the matrices have been reshaped
+ const int m = a->dimension(1);
+ const int n = b->dimension(0);
+ const int k = a->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
+
+ // Check if we need to reshape the matrix A and matrix B
+ const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+
+ if(run_interleave_transpose)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+ }
+
+ // Validate matrix multiply
+ auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+
+ if(beta != 0 && c != nullptr)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+ }
+
return Status{};
}
void CLGEMM::run()
{
+ prepare();
+
_memory_group.acquire();
if(_is_interleaved_transposed)
@@ -180,14 +234,7 @@
// Run interleave kernel
CLScheduler::get().enqueue(_interleave_kernel, false);
- if(_is_first_run)
- {
- // Run transpose kernel
- CLScheduler::get().enqueue(_transpose_kernel, false);
-
- _is_first_run = false;
- }
- else if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -205,3 +252,19 @@
_memory_group.release();
}
+
+void CLGEMM::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ _tmp_b.allocator()->allocate();
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+ _original_b->mark_as_unused();
+ }
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
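
The expanded static validate() above mirrors configure() step by step (interleave, transpose, matrix multiply, optional addition), so a GEMM configuration can be vetted without touching device memory. A hedged pre-flight sketch; the shapes and the error handler are illustrative, not from the patch:

    // Hypothetical pre-flight check before configuring a CLGEMM.
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"
    using namespace arm_compute;

    bool can_run_gemm()
    {
        TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32);   // K = 64, M = 32
        TensorInfo b_info(TensorShape(128U, 64U), 1, DataType::F32);  // N = 128, K = 64
        TensorInfo dst_info(TensorShape(128U, 32U), 1, DataType::F32);

        const Status status = CLGEMM::validate(&a_info, &b_info, nullptr, &dst_info, 1.f, 0.f, GEMMInfo());
        // Reject the configuration before any allocation happens.
        return status.error_code() == ErrorCode::OK;
    }
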
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c58af36..79495e4 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -38,8 +38,8 @@
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped()
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel()
{
}
@@ -86,16 +86,12 @@
void CLConvolutionLayerReshapeWeights::run()
{
- _memory_group.acquire();
-
CLScheduler::get().enqueue(_weights_reshape_kernel);
-
- _memory_group.release();
}
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
- _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+ _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
@@ -155,7 +151,8 @@
return Status{};
}
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -164,9 +161,13 @@
biases != nullptr ? biases->info() : nullptr,
output->info(),
conv_info,
- weights_info));
+ weights_info,
+ dilation,
+ act_info));
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_prepared = false;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const DataType dt = input->info()->data_type();
@@ -191,7 +192,7 @@
const unsigned int kernel_width = weights->info()->dimension(0);
const unsigned int kernel_height = weights->info()->dimension(1);
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info);
+ conv_info, dilation);
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
@@ -226,7 +227,7 @@
_memory_group.manage(&_gemm_output);
// Configure im2col
- _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
// Configure GEMM
configure_mm(&_im2col_output, weights, &_gemm_output);
@@ -255,14 +256,19 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- // Allocate intermediate tensor
- _weights_reshaped.allocator()->allocate();
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
ARM_COMPUTE_UNUSED(weights_info);
}
Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
@@ -272,6 +278,11 @@
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
+ }
+
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && (!is_quantized);
const unsigned bias_element = (append_bias) ? 1 : 0;
@@ -284,12 +295,12 @@
const unsigned int kernel_width = weights->dimension(0);
const unsigned int kernel_height = weights->dimension(1);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info, dilation);
unsigned int mat_weights_cols = weights->dimension(3);
unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
- CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
// Create tensor info for im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
@@ -300,7 +311,7 @@
shape_im2col.set(2, 1);
TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->fixed_point_position());
im2col_reshaped_info.set_quantization_info(input->quantization_info());
- CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
// Create GEMM output tensor
TensorShape shape_gemm = im2col_reshaped_info.tensor_shape();
@@ -311,9 +322,10 @@
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->fixed_point_position());
info_gemm.set_quantization_info(output->quantization_info());
- validate_mm(&im2col_reshaped_info, weights, &info_gemm);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(&im2col_reshaped_info, weights, &info_gemm));
+ TensorInfo tmp_info(shape_gemm, 1, DataType::QASYMM8, input->fixed_point_position());
+ tmp_info.set_quantization_info(output->quantization_info());
- TensorInfo tmp_info(input->tensor_shape(), 1, DataType::QASYMM8, input->fixed_point_position());
if(is_quantized)
{
float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output->quantization_info().scale;
@@ -324,7 +336,7 @@
}
// Validate Col2Im
- CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h)));
if(biases != nullptr)
{
@@ -341,18 +353,18 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+
return Status{};
}
void CLGEMMConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- _reshape_weights.run();
-
- _is_first_run = false;
- }
+ prepare();
_memory_group.acquire();
@@ -377,5 +389,36 @@
// Reshape output matrix
CLScheduler::get().enqueue(_col2im_kernel, false);
+ // Run Activation Layer if enabled
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+
_memory_group.release();
}
+
+void CLGEMMConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run weights reshaping and mark original weights as unused
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Run GEMM prepare
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ if(!_weights_reshaped.is_used())
+ {
+ _weights_reshaped.allocator()->free();
+ }
+ }
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
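
With the new dilation and act_info parameters, a dilated convolution with a fused activation becomes a single configure call: the activation is checked up front through CLActivationLayer::validate and executed right after col2im. An illustrative call, with tensor names assumed:

    // Hypothetical use of the extended configure() signature above.
    #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
    using namespace arm_compute;

    void configure_dilated_conv(ICLTensor &src, const ICLTensor &weights, const ICLTensor &biases, ICLTensor &dst)
    {
        CLGEMMConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst,
                       PadStrideInfo(1, 1, 1, 1), // stride 1, padding 1
                       WeightsInfo(),
                       Size2D(2U, 2U),            // dilation factor 2 in x and y
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)); // fused ReLU6
    }
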
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index c688299..711b006 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -41,7 +41,7 @@
{
bool flag = true;
- if(gpu_target == GPUTarget::BIFROST)
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
// COMPMID-852
if(k > 256 && m > 4 && reshape_b_only_on_first_run)
@@ -102,7 +102,10 @@
matrix_b = &_tmp_b;
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
// Configure interleave kernel
_mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
@@ -119,7 +122,10 @@
{
TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
- _memory_group.manage(&_vector_sum_col);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_vector_sum_col);
+ }
// Configure Matrix B reduction kernel
_mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
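
The same lifetime rule as in CLGEMM applies here: buffers handled by the memory group are only valid between acquire() and release(), so _tmp_b and _vector_sum_col join the group only when they are recomputed on every run. When B is reshaped just once, they must persist across runs and therefore stay unmanaged. Schematically (a paraphrase of the pattern above, not additional code):

    if(!_reshape_b_only_on_first_run)
    {
        _memory_group.manage(&_tmp_b); // transient: recomputed each run, safe to recycle
    }
    // else: _tmp_b holds the reshaped B between runs and must not be recycled.
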
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 4b32954..ddce5fb 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,8 @@
}
CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
- : _border_handler(),
+ : _horizontal_border_handler(),
+ _vertical_border_handler(),
_horizontal_reduction(),
_vertical_reduction()
{
@@ -64,6 +65,9 @@
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+ // Constant value to use for the vertical border fill when the border mode is CONSTANT
+ const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
+
/* Get number of pyramid levels */
const size_t num_levels = pyramid->info()->num_levels();
@@ -72,28 +76,31 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _vertical_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
-
_tmp.init(pyramid_info);
for(size_t i = 0; i < num_levels - 1; ++i)
{
/* Configure horizontal kernel */
- _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
/* Configure vertical kernel */
- _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
/* Configure border */
- _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+ /* Configure vertical border */
+ _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
}
_tmp.allocate();
}
@@ -110,13 +117,15 @@
_pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
_input->map(CLScheduler::get().queue(), true /* blocking */);
_pyramid->get_pyramid_level(0)->copy_from(*_input);
+
_input->unmap(CLScheduler::get().queue());
_pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- CLScheduler::get().enqueue(_border_handler[i], false);
+ CLScheduler::get().enqueue(_horizontal_border_handler[i], false);
CLScheduler::get().enqueue(_horizontal_reduction[i], false);
+ CLScheduler::get().enqueue(_vertical_border_handler[i], false);
CLScheduler::get().enqueue(_vertical_reduction[i], false);
}
}
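
The pixel_value_u16 expression above is presumably the CONSTANT border value pre-scaled by the horizontal 5-tap kernel's weight sum: the horizontal pass writes unnormalised [1 4 6 4 1] sums into the U16 intermediate, so a constant source border c appears there as

    c*2 + c*8 + c*6 = c*(1 + 1) + c*(4 + 4) + c*6 = c*(1 + 4 + 6 + 4 + 1) = 16*c

which is exactly the value the new vertical border handler fills into the border of _tmp before the vertical reduction runs.
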
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index d1bb65f..a3010a7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,6 +52,26 @@
_sumsq.allocator()->allocate();
}
+Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ TensorShape shape(input->tensor_shape());
+
+ // Create intermediate tensor info
+ TensorInfo sum_sq;
+ sum_sq.set_data_type(input->data_type());
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+ // Reduce shape on axis (supported axis is 0)
+ shape.set(0, 1);
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+ return Status{};
+}
+
void CLL2NormalizeLayer::run()
{
_memory_group.acquire();
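
For reference, the layer being validated computes the usual L2 normalization along the supported axis 0 (a sketch of the standard definition; the exact epsilon handling lives in CLL2NormalizeLayerKernel):

    \mathrm{output}_i = \frac{x_i}{\sqrt{\max\left(\sum_j x_j^2,\ \epsilon\right)}}

which is why validate() first checks the SUM_SQUARE reduction into a full-shape intermediate and then re-checks it with the axis collapsed to 1.
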
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
new file mode 100644
index 0000000..930d311
--- /dev/null
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate1(), _gemm_input_gate2(), _transpose_input_gate1(), _transpose_input_gate2(), _accum_input_gate1(),
+ _accum_input_gate2(), _subtract_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate1(), _gemm_forget_gate2(), _transpose_forget_gate1(),
+ _transpose_forget_gate2(), _accum_forget_gate1(), _accum_forget_gate2(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state1(),
+ _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output1(),
+ _gemm_output2(), _transpose_output1(), _transpose_output2(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state(),
+ _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(),
+ _input_gate_out3(), _input_gate_out4(), _input_gate_out5(), _input_gate_out6(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+ _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _output6(),
+ _cell_state_activation(), _output_projection1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
+ _perform_projection_clipping(false)
+{
+}
+
+void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ ICLTensor *output_state, ICLTensor *cell_state, ICLTensor *scratch_buffer, ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info,
+ float cell_threshold, float projection_threshold)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+ LSTMParams<ITensorInfo> lstm_params_info;
+ if(lstm_params.has_peephole_opt())
+ {
+ lstm_params_info.set_peephole_params(lstm_params.cell_to_input_weights()->info(), lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+ }
+ if(lstm_params.has_projection())
+ {
+ lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(), lstm_params.projection_bias()->info());
+ }
+ if(!lstm_params.has_cifg_opt())
+ {
+ lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
+ lstm_params.cell_to_input_weights()->info(), lstm_params.input_gate_bias()->info());
+ }
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
+ input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ output_state->info(), cell_state->info(), scratch_buffer->info(), output->info(), lstm_params_info,
+ activation_info, cell_threshold, projection_threshold));
+
+ const TensorShape cell_state_shape = cell_state->info()->tensor_shape();
+
+ TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ TensorShape forget_gate2_shape = compute_transposed_shape(*forget_gate_bias->info());
+ TensorShape forget_gate3_shape{ 1, output_state->info()->dimension(1) };
+ _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
+ _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the forget gate
+ // forget_gate = Activation(input * input_to_forget_weights + output_state * recurrent_to_forget_weights + cell_state * cell_to_forget_weights + forget_gate_bias)
+ _memory_group.manage(&_forget_gate_out1);
+ _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1, true, false);
+ _memory_group.manage(&_forget_gate_out2);
+ _transpose_forget_gate1.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+ _memory_group.manage(&_forget_gate_out3);
+ _gemm_forget_gate1.configure(output_state, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+ _forget_gate_out2.allocator()->allocate();
+ _memory_group.manage(&_forget_gate_out6);
+ _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out6, ConvertPolicy::SATURATE);
+ CLTensor *forget_gate_out = &_forget_gate_out6;
+
+ if(lstm_params.has_peephole_opt())
+ {
+ _forget_gate_out4.allocator()->init(TensorInfo(forget_gate2_shape, 1, input->info()->data_type()));
+ _forget_gate_out5.allocator()->init(TensorInfo(forget_gate3_shape, 1, input->info()->data_type()));
+
+ _run_peephole_opt = true;
+ _memory_group.manage(&_forget_gate_out4);
+ _transpose_forget_gate2.configure(lstm_params.cell_to_forget_weights(), &_forget_gate_out4);
+ _memory_group.manage(&_forget_gate_out5);
+ _gemm_forget_gate2.configure(cell_state, &_forget_gate_out4, nullptr, &_forget_gate_out5, 1.f, 0.f);
+ _forget_gate_out4.allocator()->allocate();
+ _accum_forget_gate2.configure(&_forget_gate_out6, &_forget_gate_out5, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _forget_gate_out5.allocator()->allocate();
+ _forget_gate_out6.allocator()->allocate();
+ forget_gate_out = &_forget_gate_out3;
+ }
+ else
+ {
+ _forget_gate_out3.allocator()->allocate();
+ }
+ _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ forget_gate_out->allocator()->allocate();
+
+ TensorShape input_gate3_shape{ 1, output_state->info()->dimension(1) };
+ _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_gate_out5.allocator()->init(TensorInfo(input_gate3_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the input gate
+ // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + cell_state * cell_to_input_weights + input_gate_bias), without CIFG
+ // input_gate = 1 - forget_gate, with CIFG
+ if(lstm_params.has_cifg_opt())
+ {
+ _memory_group.manage(&_input_gate_out1);
+ _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _ones.allocator()->allocate();
+ _run_cifg_opt = true;
+ }
+ else
+ {
+ TensorShape input_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ TensorShape input_gate2_shape = compute_transposed_shape(*lstm_params.cell_to_input_weights()->info());
+
+ _input_gate_out2.allocator()->init(TensorInfo(input_gate1_shape, 1, input->info()->data_type()));
+ _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_gate_out4.allocator()->init(TensorInfo(input_gate2_shape, 1, input->info()->data_type()));
+ _input_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_input_gate_out1);
+ _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1, true, false);
+ _memory_group.manage(&_input_gate_out2);
+ _transpose_input_gate1.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+ _memory_group.manage(&_input_gate_out3);
+ _gemm_input_gate1.configure(output_state, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+ _input_gate_out2.allocator()->allocate();
+ _memory_group.manage(&_input_gate_out4);
+ _transpose_input_gate2.configure(lstm_params.cell_to_input_weights(), &_input_gate_out4);
+ _memory_group.manage(&_input_gate_out5);
+ _gemm_input_gate2.configure(cell_state, &_input_gate_out4, nullptr, &_input_gate_out5, 1.f, 0.f);
+ _input_gate_out4.allocator()->allocate();
+ _memory_group.manage(&_input_gate_out6);
+ _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out6, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ _accum_input_gate2.configure(&_input_gate_out6, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out5.allocator()->allocate();
+ _input_gate_out6.allocator()->allocate();
+ _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ }
+
+ TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
+ _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the cell state
+ // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
+ _memory_group.manage(&_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1, true, false);
+ _memory_group.manage(&_cell_state_out2);
+ _transpose_cell_state1.configure(recurrent_to_cell_weights, &_cell_state_out2);
+ _memory_group.manage(&_cell_state_out3);
+ _gemm_cell_state1.configure(output_state, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _cell_state_out2.allocator()->allocate();
+ _memory_group.manage(&_cell_state_out4);
+ _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+ _memory_group.manage(&_cell_state_out5);
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _input_gate_out1.allocator()->allocate();
+ _cell_state_out4.allocator()->allocate();
+ _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _forget_gate_out1.allocator()->allocate();
+ _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _cell_state_out3.allocator()->allocate();
+ _cell_state_out5.allocator()->allocate();
+
+ // Perform clipping
+ if(cell_threshold != 0.f)
+ {
+ _perform_cell_clipping = true;
+ _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ }
+
+ TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ TensorShape output2_shape = compute_transposed_shape(*cell_bias->info());
+ TensorShape output3_shape{ 1, output_state->info()->dimension(1) };
+ _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the output
+ // output_gate = Activation(input * input_to_output_weights + output_state * recurrent_to_output_weights + cell_state * cell_to_output_weights + output_gate_bias)
+ _memory_group.manage(&_output1);
+ _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1, true, false);
+ _memory_group.manage(&_output2);
+ _transpose_output1.configure(recurrent_to_output_weights, &_output2);
+ _memory_group.manage(&_output3);
+ _gemm_output1.configure(output_state, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _output2.allocator()->allocate();
+ _memory_group.manage(&_output6);
+ _accum_output1.configure(&_output1, &_output3, &_output6, ConvertPolicy::SATURATE);
+ _output3.allocator()->allocate();
+ CLTensor *output_gate_out = &_output6;
+ if(lstm_params.has_peephole_opt())
+ {
+ _output4.allocator()->init(TensorInfo(output2_shape, 1, input->info()->data_type()));
+ _output5.allocator()->init(TensorInfo(output3_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_output4);
+ _transpose_output2.configure(lstm_params.cell_to_output_weights(), &_output4);
+ _memory_group.manage(&_output5);
+ _gemm_output2.configure(&_cell_state_out1, &_output4, nullptr, &_output5, 1.f, 0.f);
+ _accum_output2.configure(&_output6, &_output5, &_output1, ConvertPolicy::SATURATE);
+ _output6.allocator()->allocate();
+ output_gate_out = &_output1;
+
+ // Allocate intermediate buffers
+ _output4.allocator()->allocate();
+ _output5.allocator()->allocate();
+ }
+ else
+ {
+ _output1.allocator()->allocate();
+ }
+ _activation_output.configure(output_gate_out, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ output_gate_out->allocator()->allocate();
+
+ _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the output state
+ /** lstm_res = PixelwiseMul(output, Activation(cell_state))
+ *
+ * -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
+ * /
+ * output_state = --
+ * \
+ * -- lstm_res , otherwise
+ */
+ _memory_group.manage(&_cell_state_activation);
+ _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
+ _pixelwise_mul_output_state.configure(&_cell_state_activation, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _cell_state_activation.allocator()->allocate();
+
+ if(lstm_params.has_projection())
+ {
+ _has_projection_weights = true;
+ _output_projection1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_output_projection1);
+ _fully_connected_output_state.configure(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), &_output_projection1, true, false);
+ // Perform clipping
+ if(projection_threshold != 0.f)
+ {
+ _perform_projection_clipping = true;
+ _projection_clip.configure(&_output_projection1, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ }
+
+ // Allocate intermediate buffer
+ _output_projection1.allocator()->allocate();
+ }
+
+ // Copy cell state and output
+ _copy_cell_state.configure(&_cell_state_out1, cell_state);
+ _cell_state_out1.allocator()->allocate();
+ _copy_output.configure(output_state, output);
+
+ // Vector for holding the tensors to store in scratch buffer
+ std::vector<ICLTensor *> scratch_inputs;
+ if(lstm_params.has_cifg_opt())
+ {
+ scratch_inputs.emplace_back(&_input_gate_out1);
+ }
+ scratch_inputs.emplace_back(&_cell_state_out1);
+ scratch_inputs.emplace_back(forget_gate_out);
+ scratch_inputs.emplace_back(output_gate_out);
+ _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+}
+
+Status CLLSTMLayer::validate(const ITensorInfo *input, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state, const ITensorInfo *cell_state, const ITensorInfo *scratch_buffer, const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights(), lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() != 1);
+ }
+
+ TensorShape units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
+ TensorShape gemmv_shape{ 1, output_state->dimension(1) };
+ TensorShape num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
+ const TensorInfo units_out_transposed_info = TensorInfo(units_out_transposed_shape, 1, input->data_type());
+ const TensorInfo gemmv_shape_info = TensorInfo(gemmv_shape, 1, input->data_type());
+ const TensorInfo num_units_transposed_info = TensorInfo(num_units_transposed_shape, 1, input->data_type());
+
+ // Validate forget gate
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, cell_state, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state, &units_out_transposed_info, nullptr, cell_state, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Validate input gate
+ if(!lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.cell_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), cell_state, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+ }
+
+ // Validate cell state
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, cell_state, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, cell_state, cell_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+
+ if(cell_threshold != 0.f)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, cell_state, true, false));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Validate output state
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ if(lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), cell_state, true, false));
+ if(projection_threshold != 0.f)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
+ }
+ }
+
+ std::vector<TensorInfo> inputs_vector_info;
+ if(lstm_params.has_cifg_opt())
+ {
+ inputs_vector_info.emplace_back(*cell_state);
+ }
+ inputs_vector_info.emplace_back(*cell_state);
+ inputs_vector_info.emplace_back(*cell_state);
+ inputs_vector_info.emplace_back(*cell_state);
+
+ std::vector<ITensorInfo *> inputs_vector_info_raw;
+ for(auto &input : inputs_vector_info)
+ {
+ inputs_vector_info_raw.emplace_back(&input);
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+ return Status{};
+}
+
+void CLLSTMLayer::run()
+{
+ _memory_group.acquire();
+
+ _fully_connected_forget_gate.run();
+ CLScheduler::get().enqueue(_transpose_forget_gate1);
+ _gemm_forget_gate1.run();
+ CLScheduler::get().enqueue(_accum_forget_gate1);
+
+ if(_run_peephole_opt)
+ {
+ CLScheduler::get().enqueue(_transpose_forget_gate2);
+ _gemm_forget_gate2.run();
+ _accum_forget_gate2.run();
+ }
+ CLScheduler::get().enqueue(_activation_forget_gate);
+
+ if(_run_cifg_opt)
+ {
+ _ones.map(true);
+ std::fill_n(_ones.buffer(), _ones.info()->total_size(), 1);
+ _ones.unmap();
+ CLScheduler::get().enqueue(_subtract_input_gate);
+ }
+ else
+ {
+ _fully_connected_input_gate.run();
+ CLScheduler::get().enqueue(_transpose_input_gate1);
+ _gemm_input_gate1.run();
+ CLScheduler::get().enqueue(_transpose_input_gate2);
+ _gemm_input_gate2.run();
+ CLScheduler::get().enqueue(_accum_input_gate1);
+ _accum_input_gate2.run();
+ CLScheduler::get().enqueue(_activation_input_gate);
+ }
+
+ _fully_connected_cell_state.run();
+ CLScheduler::get().enqueue(_transpose_cell_state1);
+ _gemm_cell_state1.run();
+ CLScheduler::get().enqueue(_accum_cell_state1);
+ CLScheduler::get().enqueue(_activation_cell_state);
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
+ CLScheduler::get().enqueue(_accum_cell_state2);
+
+ if(_perform_cell_clipping)
+ {
+ CLScheduler::get().enqueue(_cell_clip);
+ }
+
+ _fully_connected_output.run();
+ CLScheduler::get().enqueue(_transpose_output1);
+ _gemm_output1.run();
+ CLScheduler::get().enqueue(_accum_output1);
+ CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+ if(_run_peephole_opt)
+ {
+ CLScheduler::get().enqueue(_transpose_output2);
+ _gemm_output2.run();
+ _accum_output2.run();
+ }
+ CLScheduler::get().enqueue(_activation_output);
+
+ CLScheduler::get().enqueue(_activation_output_state);
+ CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+ if(_has_projection_weights)
+ {
+ _fully_connected_output_state.run();
+ if(_perform_projection_clipping)
+ {
+ CLScheduler::get().enqueue(_projection_clip);
+ }
+ }
+
+ CLScheduler::get().enqueue(_copy_cell_state);
+ CLScheduler::get().enqueue(_copy_output);
+
+ _concat_scratch_buffer.run();
+
+ _memory_group.release();
+}
\ No newline at end of file
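
Gathering the per-gate comments above into one place, the function computes the standard LSTM cell. In the notation of those comments, with \sigma the logistic function, g the configured activation_info, \odot elementwise multiplication, and each clip applied only when its threshold is non-zero:

    f_t = \sigma(W_f x_t + R_f h_{t-1} + P_f \odot c_{t-1} + b_f)
    i_t = \sigma(W_i x_t + R_i h_{t-1} + P_i \odot c_{t-1} + b_i)    (without CIFG; i_t = 1 - f_t with CIFG)
    c_t = \mathrm{clip}(i_t \odot g(W_c x_t + R_c h_{t-1} + b_c) + f_t \odot c_{t-1},\ cell\_threshold)
    o_t = \sigma(W_o x_t + R_o h_{t-1} + P_o \odot c_t + b_o)
    h_t = o_t \odot g(c_t)    (followed by \mathrm{clip}(W_{proj} h_t + b_{proj},\ projection\_threshold) when a projection is present)

The scratch buffer then concatenates i_t (omitted with CIFG), c_t, f_t and o_t along the width, which matches the 4x/3x dimension check in validate().
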
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 9120aad..986fe00 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,72 +33,120 @@
using namespace arm_compute;
-CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false)
+namespace
{
-}
-
-void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+ ARM_COMPUTE_UNUSED(output);
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
- }
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
- bool _has_bias = (biases != nullptr);
- _is_first_run = true;
-
- // Get parameters for conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- unsigned int pad_x = 0;
- unsigned int pad_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- std::tie(pad_x, pad_y) = conv_info.pad();
+ bool has_bias = (biases != nullptr);
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
conv_info);
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+ const size_t mat_weights_cols = weights->dimension(3);
+ const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->dimension(4);
- // Create tensor to store the reshaped weights
- const size_t mat_weights_cols = weights->info()->dimension(3);
- const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
- const size_t mat_weights_num = weights->info()->dimension(4);
+ shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
- const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
-
- _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
- // Create tensor to store im2col reshaped inputs
const size_t mat_input_cols = mat_weights_rows;
const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+
+ shape_im2col = input->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
- // Create locally connected layer output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm = shape_im2col;
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false), _original_weights(nullptr)
+{
+}
+
+Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+ bool has_bias = (biases != nullptr);
+
+ if(has_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+ }
+
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+ TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+ TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+ TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+
+ return Status{};
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
+
+ bool _has_bias = (biases != nullptr);
+ _original_weights = weights;
+ _is_first_run = true;
+
+ const unsigned int kernel_width = weights->info()->dimension(0);
+ const unsigned int kernel_height = weights->info()->dimension(1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
// Manage intermediate buffers
@@ -106,7 +154,7 @@
_memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -122,8 +170,13 @@
// Run weights reshaping (Runs once for every configure)
if(_is_first_run)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_is_first_run = false;
CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index 146856c..55b7649 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -39,6 +39,6 @@
Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
- ARM_COMPUTE_RETURN_ERROR_ON(CLPermuteKernel::validate(input, output, perm));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm));
return Status{};
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 201bf87..17875a3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -41,13 +41,28 @@
_kernel = std::move(k);
// Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value(0.f);
+ BorderMode border_mode{};
+ PixelValue pixel_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
{
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel->border_size(), border_mode, zero_value);
+ switch(input->info()->data_layout())
+ {
+ case DataLayout::NCHW:
+ border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ break;
+ case DataLayout::NHWC:
+ border_mode = BorderMode::CONSTANT;
+ if(PoolingType::MAX == pool_info.pool_type() && !is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
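+ // For NHWC max pooling the border is filled with the lowest representable
+ // float so that padded elements can never win the max reduction.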
+ _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
}
Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index ed1f51c..a13859c 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
@@ -33,8 +34,21 @@
{
}
+Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ TensorInfo min_max{ input->num_channels(), input->data_type() };
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
+
+ return Status{};
+}
+
void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
// Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
_min_max_kernel.configure(input, &_min_max);
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
new file mode 100644
index 0000000..4843ba6
--- /dev/null
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+{
+}
+
+Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
+ const ITensorInfo *output, const ActivationLayerInfo &info)
+{
+ const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
+
+ auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
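+// Data-flow sketch (illustrative, matching the kernels configured below):
+//   h_t = act(W * x_t + b + R * h_{t-1})
+// The fully connected stage computes W * x_t + b, the GEMM computes
+// R * h_{t-1}, the addition kernel sums the two partial results, and the
+// activation kernel writes h_t back into hidden_state before it is copied
+// to output.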
+void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+ ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+
+ const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out, true, false);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayer::run()
+{
+ _memory_group.acquire();
+ _fully_connected_kernel.run();
+ _gemm_state_f.run();
+ CLScheduler::get().enqueue(_add_kernel);
+ CLScheduler::get().enqueue(_activation_kernel);
+
+ // Copy the hidden state to the output
+ CLScheduler::get().enqueue(_copy_kernel);
+ _memory_group.release();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index d02afb4..3a5133d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,19 +35,64 @@
using namespace arm_compute;
+namespace
+{
+unsigned int calculate_number_of_stages(const ITensorInfo *input)
+{
+ // Calculate number of WGs. 16 elements per thread, 8 threads per WG
+ const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
+
+ // Calculate number of stages. First stage performs op and the rest reduction sum
+ // depending on the size of the input. Last stage should have only 1 WG.
+ const unsigned int num_of_stages = num_of_wg / 128 + 2;
+
+ return num_of_stages;
+}
+} // namespace
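+// Illustrative sizing (assumed input): dimension(0) == 4096 gives
+// num_of_wg = ceil(4096 / 128.f) = 32 and num_of_stages = 32 / 128 + 2 = 2,
+// i.e. one kernel applying the reduction op and one final single-WG stage.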
+
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
{
}
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ const unsigned int num_of_stages = calculate_number_of_stages(input);
+
+ // Create temporary tensor infos
+ auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+ // Create intermediate tensor info
+ TensorShape shape{ input->tensor_shape() };
+
+ for(unsigned int i = 0; i < num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ sums_vector[i].set_data_type(input->data_type());
+ sums_vector[i].set_tensor_shape(shape);
+ sums_vector[i].set_num_channels(input->num_channels());
+ sums_vector[i].set_fixed_point_position(input->fixed_point_position());
+ }
+
+ // Validate ReductionOperation only on first kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
+
+ // Validate ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+ }
+
+ // Validate ReductionOperation on the last stage
+ const unsigned int last_stage = num_of_stages - 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
+
+ return Status{};
+}
+
void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
- // Calculate number of WGs. 16 elements per thread, 8 threads per WG
- unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
-
- // Calculate number of stages. First stage performs op and the rest reduction sum
- // depending on the size of the input. Last stage should have only 1 WG.
- _num_of_stages = num_of_wg / 128 + 2;
+ _num_of_stages = calculate_number_of_stages(input->info());
// Create temporary tensors
_sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
@@ -95,4 +140,4 @@
}
_memory_group.release();
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
new file mode 100644
index 0000000..d542781
--- /dev/null
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
+ : _concat_kernels_vector(),
+ _num_inputs(0)
+{
+}
+
+Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+ // Output auto-initialization if not yet initialized
+ TensorInfo tmp_output_info = *output->clone();
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type(), inputs_vector[0]->fixed_point_position());
+
+ unsigned int width_offset = 0;
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+ width_offset += input->dimension(0);
+ }
+
+ return Status{};
+}
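+// For example (illustrative widths): concatenating inputs of widths 8, 16
+// and 8 validates the kernels at width offsets 0, 8 and 24 against an
+// output of total width 32.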
+
+void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+{
+ _num_inputs = inputs_vector.size();
+
+ std::vector<ITensorInfo *> inputs_vector_info;
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+ }
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+
+ // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+ ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
+
+ unsigned int width_offset = 0;
+
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+ width_offset += inputs_vector.at(i)->info()->dimension(0);
+ }
+}
+
+void CLWidthConcatenateLayer::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..49753ad
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+ Size2D output_tile = Size2D{};
+
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(4U, 4U);
+ }
+
+ return output_tile;
+}
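+// For example, a 56x56 input with a 3x3 kernel selects a 4x4 output tile
+// (Winograd F(4x4, 3x3)), while inputs of 4x4 or smaller fall back to
+// F(2x2, 3x3); 5x5 kernels always use a 4x4 tile.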
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} // namespace
+
+CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
+ _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+{
+}
+
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ input->info()->data_layout());
+
+ _is_prepared = false;
+ _original_weights = weights;
+
+ // Manage intermediate tensors
+ _memory_group.manage(&_input0);
+ _memory_group.manage(&_batched_mm_output);
+
+ // Do not manage _input1 as it contains the weights
+
+ // Configure input transform
+ _input_transform.configure(input, &_input0, winograd_info);
+
+ // Configure filter transform
+ _filter_transform.configure(weights, &_input1, winograd_info);
+
+ // Configure batched matrix multiply
+ _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+ // Configure output transform
+ _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
+
+ // Configure activation layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
+
+ // Allocate temporary tensors
+ _input0.allocator()->allocate();
+ _batched_mm_output.allocator()->allocate();
+}
+
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ input->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+ const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, winograd_info));
+
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+
+ // Configure output transform
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+
+ return Status{};
+}
+
+void CLWinogradConvolutionLayer::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ // Run input transform
+ _input_transform.run();
+
+ // Run batched matrix multiplication
+ _batched_mm.run();
+
+ // Run output transform
+ CLScheduler::get().enqueue(_output_transform);
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+
+ _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run filter transform and mark original weights as unused
+ _input1.allocator()->allocate();
+ CLScheduler::get().enqueue(_filter_transform, false);
+ _original_weights->mark_as_unused();
+
+ // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+ _batched_mm.prepare();
+ if(!_input1.is_used())
+ {
+ _input1.allocator()->free();
+ }
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
new file mode 100644
index 0000000..09e8456
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
+ k->configure(input, output, winograd_info);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, winograd_info));
+ return Status{};
+}
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
new file mode 100644
index 0000000..c0ebd24
--- /dev/null
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernels.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace tuners
+{
+namespace
+{
+/** Tunes a @ref CLDirectConvolutionLayerKernel for a Bifrost target
+ *
+ * @param[in, out] k Kernel to tune
+ */
+void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k)
+{
+ cl::NDRange lws_hint = k.lws_hint();
+
+ const GPUTarget gpu_target = k.get_target();
+ const DataType dt = k._input->info()->data_type();
+ const TensorShape weights_shape = k._weights->info()->tensor_shape();
+ const TensorShape inputs_shape = k._input->info()->tensor_shape();
+ const size_t kernel_size = weights_shape.x();
+ const unsigned int stride_x = k._conv_stride_x;
+ const unsigned int stride_y = k._conv_stride_y;
+
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32))
+ {
+ // Through extensive experimentation with over 30 representative tensor
+ // shapes, we found a small number of local work size configurations
+ // that result in nearly optimal execution times. Selecting the right
+ // lws for a given shape, however, required a complex decision tree,
+ // until we constructed a simple feature as described below.
+ //
+ // We started from the number of multiply-accumulate operations for a
+ // convolution layer, which is equal to the product of the input
+ // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
+ // this resulted in ties between distinct shapes that required distinct
+ // lws configurations. Replacing the width of the input with the kernel
+ // size, however, resulted in nearly optimal predictions. We use underscores
+ // in variable names to indicate when they are intentionally misleading.
+ const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2];
+ const size_t product_of_input_dimensions_ = inputs_shape[0] * inputs_shape[1] * inputs_shape[2];
+ const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
+
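+ // Illustrative feature value (assumed shapes): 3x3x64 weights
+ // (product 576) over a 56x56x64 input (product 200704) give
+ // mega_ops_ = 1e-6 * 576 * 200704 ~= 115.6, which selects the
+ // (2, 1, 6) local work size in the kernel_size == 3 case below.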
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ if(mega_ops_ < 1.f)
+ {
+ lws_hint = cl::NDRange(1, 1, 8);
+ }
+ else if(mega_ops_ < 7.f)
+ {
+ lws_hint = cl::NDRange(1, 1, 4);
+ }
+ else
+ {
+ lws_hint = cl::NDRange(1, 1, 2);
+ }
+ break;
+ }
+ case 3:
+ {
+ if(mega_ops_ < 1.f)
+ {
+ lws_hint = cl::NDRange(1, 1, 8);
+ }
+ else if(mega_ops_ < 13.f)
+ {
+ lws_hint = cl::NDRange(2, 1, 4);
+ }
+ else if(mega_ops_ < 50.f)
+ {
+ lws_hint = cl::NDRange(3, 1, 4);
+ }
+ else
+ {
+ lws_hint = cl::NDRange(2, 1, 6);
+ }
+ break;
+ }
+ case 5:
+ {
+ if(mega_ops_ < 2.f || mega_ops_ > 80.f)
+ {
+ lws_hint = cl::NDRange(2, 1, 4);
+ }
+ else
+ {
+ lws_hint = cl::NDRange(2, 1, 8);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ k.set_lws_hint(lws_hint);
+ }
+}
+} // namespace
+
+void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
+{
+ // Check the concrete kernel type and dispatch to the matching static tuning heuristic
+ if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
+ {
+ tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
+ }
+}
+
+void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
+{
+ ARM_COMPUTE_UNUSED(kernel);
+}
+} // namespace tuners
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 168ed6e..92dce34 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
#include <condition_variable>
#include <iostream>
@@ -159,6 +160,7 @@
: _num_threads(num_threads_hint()),
_threads(_num_threads - 1)
{
+ get_cpu_configuration(_cpu_info);
}
void CPPScheduler::set_num_threads(unsigned int num_threads)
@@ -178,7 +180,7 @@
/** [Scheduler example] */
ThreadInfo info;
- info.cpu_info = _info;
+ info.cpu_info = &_cpu_info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index c8285b4..2adc14c 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@
{
ARM_COMPUTE_UNUSED(split_dimension);
ThreadInfo info;
- info.cpu_info = cpu_info();
+ info.cpu_info = &_cpu_info;
kernel->run(kernel->window(), info);
}
diff --git a/src/graph/CL/CLUnmap.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
similarity index 65%
copy from src/graph/CL/CLUnmap.cpp
copy to src/runtime/CPP/functions/CPPUpsample.cpp
index 31f2f19..619b7e1 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/CL/CLUnmap.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute::graph;
+using namespace arm_compute;
-CLUnmap::CLUnmap(ITensorObject *tensor)
- : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
+void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info, unsigned int inner_border_right, unsigned int inner_border_top)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
-{
- _tensor->unmap(arm_compute::CLScheduler::get().queue());
-}
+ auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernel>();
+ k->configure(input, output, info, inner_border_right, inner_border_top);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
new file mode 100644
index 0000000..7e8bf2b
--- /dev/null
+++ b/src/runtime/CPUUtils.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPUUtils.h"
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <fstream>
+#include <map>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef BARE_METAL
+#include <regex>
+#include <thread>
+#endif /* BARE_METAL */
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !BARE_METAL */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif /* HWCAP_ASIMDHP */
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif /* HWCAP_CPUID */
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP (1 << 20)
+#endif /* HWCAP_ASIMDDP */
+
+namespace
+{
+using namespace arm_compute;
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+struct PerCPUData
+{
+ CPUModel model = CPUModel::GENERIC;
+ unsigned int midr = 0;
+ bool model_set = false;
+};
+
+/* Convert an MIDR register value to a CPUModel enum value. */
+CPUModel midr_to_model(const unsigned int midr)
+{
+ CPUModel model;
+
+ // Unpack variant and CPU ID
+ const int variant = (midr >> 20) & 0xF;
+ const int cpunum = (midr >> 4) & 0xFFF;
+
+ // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
+ switch(cpunum)
+ {
+ case 0xd03:
+ model = CPUModel::A53;
+ break;
+
+ case 0xd05:
+ if(variant != 0)
+ {
+ model = CPUModel::A55r1;
+ }
+ else
+ {
+ model = CPUModel::A55r0;
+ }
+ break;
+
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+
+ return model;
+}
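+/* Example: an MIDR value of 0x4110d050 unpacks to variant 1 and
+ * part 0xd05, which the switch above classifies as CPUModel::A55r1. */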
+
+void populate_models_cpuid(std::vector<PerCPUData> &cpusv)
+{
+ // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table.
+ uint32_t i = 0;
+ for(auto &c : cpusv)
+ {
+ std::stringstream str;
+ str << "/sys/devices/system/cpu/cpu" << i++ << "/regs/identification/midr_el1";
+ std::ifstream file;
+ file.open(str.str(), std::ios::in);
+ if(file.is_open())
+ {
+ std::string line;
+ if(bool(getline(file, line)))
+ {
+ const unsigned long midr = support::cpp11::stoul(line, nullptr, 16);
+ c.midr = (midr & 0xffffffff);
+ c.model = midr_to_model(c.midr);
+ c.model_set = true;
+ }
+ }
+ }
+}
+
+void populate_models_cpuinfo(std::vector<PerCPUData> &cpusv)
+{
+ // If "long-form" cpuinfo is present, parse that to populate models.
+ std::regex proc_regex("^processor.*(\\d+)$");
+ std::regex imp_regex("^CPU implementer.*0x(..)$");
+ std::regex var_regex("^CPU variant.*0x(.)$");
+ std::regex part_regex("^CPU part.*0x(...)$");
+ std::regex rev_regex("^CPU revision.*(\\d+)$");
+
+ std::ifstream file;
+ file.open("/proc/cpuinfo", std::ios::in);
+
+ if(file.is_open())
+ {
+ std::string line;
+ int midr = 0;
+ int curcpu = -1;
+
+ while(bool(getline(file, line)))
+ {
+ std::smatch match;
+
+ if(std::regex_match(line, match, proc_regex))
+ {
+ std::string id = match[1];
+ int newcpu = support::cpp11::stoi(id, nullptr, 0);
+
+ if(curcpu >= 0 && midr == 0)
+ {
+ // Matched a new CPU ID without any description of the previous one - looks like old format.
+ return;
+ }
+
+ if(curcpu >= 0)
+ {
+ cpusv[curcpu].midr = midr;
+ cpusv[curcpu].model = midr_to_model(midr);
+ cpusv[curcpu].model_set = true;
+ }
+
+ midr = 0;
+ curcpu = newcpu;
+
+ continue;
+ }
+
+ if(std::regex_match(line, match, imp_regex))
+ {
+ int impv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (impv << 24);
+ continue;
+ }
+
+ if(std::regex_match(line, match, var_regex))
+ {
+ int varv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (varv << 20); // Variant occupies MIDR bits 20-23 (see midr_to_model)
+ continue;
+ }
+
+ if(std::regex_match(line, match, part_regex))
+ {
+ int partv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (partv << 4);
+ continue;
+ }
+
+ if(std::regex_match(line, match, rev_regex))
+ {
+ int regv = support::cpp11::stoi(match[1], nullptr, 10);
+ midr |= (regv);
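+ // Bits 16-19 form the MIDR architecture field; 0xf indicates the
+ // CPUID identification scheme, making the assembled value well-formed.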
+ midr |= (0xf << 16);
+ continue;
+ }
+ }
+
+ if(curcpu >= 0)
+ {
+ cpusv[curcpu].midr = midr;
+ cpusv[curcpu].model = midr_to_model(midr);
+ cpusv[curcpu].model_set = true;
+ }
+ }
+}
+
+int get_max_cpus()
+{
+ int max_cpus = 1;
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ std::ifstream CPUspresent;
+ CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
+ bool success = false;
+
+ if(CPUspresent.is_open())
+ {
+ std::string line;
+
+ if(bool(getline(CPUspresent, line)))
+ {
+ /* The content of this file is a list of ranges or single values, e.g.
+ * 0-5, or 1-3,5,7 or similar. As we are interested in the
+ * max valid ID, we just need to find the last valid
+ * delimiter ('-' or ',') and parse the integer immediately after that.
+ */
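+ /* For example, "0-7" parses the trailing "7" and reports 7 + 1 = 8 CPUs;
+ * "1-3,5,7" likewise yields 7 + 1 = 8. */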
+ auto startfrom = line.begin();
+
+ for(auto i = line.begin(); i < line.end(); ++i)
+ {
+ if(*i == '-' || *i == ',')
+ {
+ startfrom = i + 1;
+ }
+ }
+
+ line.erase(line.begin(), startfrom);
+
+ max_cpus = support::cpp11::stoi(line, nullptr, 0) + 1;
+ success = true;
+ }
+ }
+
+ // Return std::thread::hardware_concurrency() as a fallback.
+ if(!success)
+ {
+ max_cpus = std::thread::hardware_concurrency();
+ }
+#endif /* BARE_METAL */
+
+ return max_cpus;
+}
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+
+} // namespace
+
+namespace arm_compute
+{
+void get_cpu_configuration(CPUInfo &cpuinfo)
+{
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ bool cpuid = false;
+ bool fp16_support = false;
+ bool dot_support = false;
+
+ const uint32_t hwcaps = getauxval(AT_HWCAP);
+
+ if((hwcaps & HWCAP_CPUID) != 0)
+ {
+ cpuid = true;
+ }
+
+ if((hwcaps & HWCAP_ASIMDHP) != 0)
+ {
+ fp16_support = true;
+ }
+
+ if((hwcaps & HWCAP_ASIMDDP) != 0)
+ {
+ dot_support = true;
+ }
+
+#ifdef __aarch64__
+ /* Pre-4.15 kernels don't have the ASIMDDP bit.
+ *
+ * Although the CPUID bit allows us to read the feature register
+ * directly, the kernel quite sensibly masks this to only show
+ * features known by it to be safe to show to userspace. As a
+ * result, pre-4.15 kernels won't show the relevant bit in the
+ * feature registers either.
+ *
+ * So for now, use a whitelist of CPUs known to support the feature.
+ */
+ if(!dot_support && cpuid)
+ {
+ /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
+ const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
+ const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
+
+ unsigned long cpuid;
+
+ __asm __volatile(
+ "mrs %0, midr_el1\n"
+ : "=r"(cpuid)
+ :
+ : );
+
+ for(int i = 0; dotprod_whitelist_values[i] != 0; i++)
+ {
+ if((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i])
+ {
+ dot_support = true;
+ break;
+ }
+ }
+ }
+#endif /* __aarch64__ */
+ const unsigned int max_cpus = get_max_cpus();
+ cpuinfo.set_cpu_num(max_cpus);
+ cpuinfo.set_fp16(fp16_support);
+ cpuinfo.set_dotprod(dot_support);
+ std::vector<PerCPUData> percpu(max_cpus);
+ if(cpuid)
+ {
+ populate_models_cpuid(percpu);
+ }
+ else
+ {
+ populate_models_cpuinfo(percpu);
+ }
+ int j(0);
+ for(const auto &v : percpu)
+ {
+ cpuinfo.set_cpu_model(j++, v.model);
+ }
+#else /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+ ARM_COMPUTE_UNUSED(cpuinfo);
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+}
+
+unsigned int get_threads_hint()
+{
+ unsigned int num_threads_hint = 1;
+
+#ifndef BARE_METAL
+ std::map<std::string, unsigned int> cpu_part_occurrence_map;
+
+ // CPU part regex
+ std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
+ std::smatch cpu_part_match;
+
+ // Read cpuinfo and get occurrence of each core
+ std::ifstream cpuinfo;
+ cpuinfo.open("/proc/cpuinfo", std::ios::in);
+ if(cpuinfo.is_open())
+ {
+ std::string line;
+ while(bool(getline(cpuinfo, line)))
+ {
+ if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+ {
+ std::string cpu_part = cpu_part_match[1];
+ if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
+ {
+ cpu_part_occurrence_map[cpu_part]++;
+ }
+ else
+ {
+ cpu_part_occurrence_map[cpu_part] = 1;
+ }
+ }
+ }
+ }
+
+ // Get min number of threads
+ auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
+ [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
+ {
+ return p1.second < p2.second;
+ });
+
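+ // Illustrative outcome: on a big.LITTLE system with 4x Cortex-A53
+ // ("d03") and 2x Cortex-A72 ("d08"), the map holds {"d03": 4, "d08": 2},
+ // so the hint becomes 2 and work stays on one homogeneous cluster.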
+ // Set thread hint
+ num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
+#endif /* BARE_METAL */
+
+ return num_threads_hint;
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
new file mode 100644
index 0000000..cdd12c3
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+void *GCBufferAllocator::allocate(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ auto *gl_buffer = new GLBufferWrapper();
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, gl_buffer->_ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+
+ return reinterpret_cast<void *>(gl_buffer);
+}
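+// Note: the returned pointer wraps a GL SSBO handle rather than host
+// memory; callers must treat it as opaque and release it with free().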
+
+void GCBufferAllocator::free(void *ptr)
+{
+ ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+ auto *gl_buffer = reinterpret_cast<GLBufferWrapper *>(ptr);
+ delete gl_buffer;
+}
+
+std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(size, alignment);
+ return nullptr;
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index fcc8559..f781273 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
using namespace arm_compute;
@@ -31,7 +32,7 @@
std::once_flag GCScheduler::_initialize_symbols;
GCScheduler::GCScheduler()
- : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT)
+ : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _target(GPUTarget::MIDGARD)
{
}
@@ -48,11 +49,13 @@
{
setup_context();
- GCKernelLibrary::get().init("./cs_shaders/", _display, _context);
+ init(_display, _context);
}
void GCScheduler::init(EGLDisplay dpy, EGLContext ctx)
{
+ _target = get_target_from_device();
+
GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx);
}
diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
index edbd16d..e193d26 100644
--- a/src/runtime/GLES_COMPUTE/GCTensor.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
using namespace arm_compute;
GCTensor::GCTensor()
- : _allocator()
+ : _allocator(this)
{
}
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
index 694b34f..abd2b48 100644
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,11 +31,16 @@
using namespace arm_compute;
-GCTensorAllocator::GCTensorAllocator()
- : _gl_buffer(), _mapping(nullptr)
+GCTensorAllocator::GCTensorAllocator(GCTensor *owner)
+ : _associated_memory_group(nullptr), _gl_buffer(), _mapping(nullptr), _owner(owner)
{
}
+GCTensorAllocator::~GCTensorAllocator()
+{
+ _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
+}
+
uint8_t *GCTensorAllocator::data()
{
return _mapping;
@@ -43,17 +48,35 @@
void GCTensorAllocator::allocate()
{
- _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ if(_associated_memory_group == nullptr)
+ {
+ _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ }
+ else
+ {
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_gl_buffer), info().total_size());
+ }
info().set_is_resizable(false);
}
void GCTensorAllocator::free()
{
- _gl_buffer.reset();
- info().set_is_resizable(true);
+ if(_associated_memory_group == nullptr)
+ {
+ _gl_buffer.reset();
+ info().set_is_resizable(true);
+ }
+}
+
+void GCTensorAllocator::set_associated_memory_group(GCMemoryGroup *associated_memory_group)
+{
+ ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+ ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_ERROR_ON(_gl_buffer.get() != nullptr);
+ _associated_memory_group = associated_memory_group;
}
uint8_t *GCTensorAllocator::lock()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index 1d2370e..2a710f7 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -37,14 +37,14 @@
using namespace arm_compute;
GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+ : _weights_reshape_kernel(), _weights_reshaped()
{
}
-void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose1xW)
+void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
@@ -56,73 +56,66 @@
}
const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const unsigned bias_element = (append_biases) ? 1 : 0;
const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
- _transpose1xW = transpose1xW;
-
- if(transpose1xW)
- {
- // Create tensor to store the reshaped weights
- const unsigned int mat_weights_cols = weights->info()->dimension(3);
- const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- const DataType dt = weights->info()->data_type();
- const int fixed_point_position = weights->info()->fixed_point_position();
- TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
-
- _weights_reshaped.allocator()->init(info_wr);
- _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, output);
- _weights_reshaped.allocator()->allocate();
- }
- else
- {
- _weights_reshape_kernel.configure(weights, biases_to_use, output);
- }
+ _weights_reshape_kernel.configure(weights, biases_to_use, output);
}
void GCConvolutionLayerReshapeWeights::run()
{
GCScheduler::get().dispatch(_weights_reshape_kernel);
- if(_transpose1xW)
- {
- GCScheduler::get().dispatch(_weights_transposed_kernel);
- }
}
-GCConvolutionLayer::GCConvolutionLayer()
- : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _fill_border(), _input_im2col_reshaped(), _input_interleaved_reshaped(),
- _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false)
{
}
-void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output, bool is_interleaved_transposed)
+void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
{
- _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
+
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
}
-void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
{
+ // Perform validation step on Matrix multiply function
+ GCGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
+ return Status{};
+}
+
+void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+ _is_first_run = true;
+ _original_weights = weights;
+
if(biases != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
const DataType dt = input->info()->data_type();
- _append_bias = (biases != nullptr);
- _are_weights_reshaped = weights_info.are_reshaped();
+ // Set the GPU target for im2col and col2im
+ _input_im2col_kernel.set_target(GCScheduler::get().get_target());
+ _output_col2im_kernel.set_target(GCScheduler::get().get_target());
- const unsigned bias_element = (_append_bias) ? 1 : 0;
- const IGCTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+ const bool append_bias = (biases != nullptr);
+ const unsigned bias_element = (append_bias) ? 1 : 0;
+ const IGCTensor *biases_to_use = (append_bias) ? biases : nullptr;
// Get parameters from conv_info
unsigned int stride_x = 0;
@@ -133,57 +126,19 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
- const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ const unsigned int kernel_width = weights->info()->dimension(0);
+ const unsigned int kernel_height = weights->info()->dimension(1);
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info);
-
- // Check if its a "fully connected" convolution
- _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- const bool run_interleaved = (!_is_fully_connected_convolution);
+ conv_info, dilation);
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
- // Reshape weights if needed
- if(_are_weights_reshaped)
- {
- if(_is_fully_connected_convolution)
- {
- mat_weights_cols = weights->info()->dimension(0);
- mat_weights_rows = weights->info()->dimension(1);
- }
- else
- {
- mat_weights_cols = weights_info.num_kernels();
- const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
- mat_weights_rows = quarter_reshaped_cols + bias_element;
- }
- }
- else
- {
- if(_is_fully_connected_convolution)
- {
- // Create tensor to store the reshaped weights
- int num_elems_read_per_iteration_x = 1;
- if(dt == DataType::F16)
- {
- num_elems_read_per_iteration_x = 2;
- }
- TensorShape shape_wr((ceil_to_multiple(mat_weights_cols, num_elems_read_per_iteration_x)), mat_weights_rows);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
- }
- weights = &_weights_reshaped;
- }
+ // _weights_reshaped will be auto configured in the kernel.
+ // Just append biases and do not transpose 1xW as it will be reshaped in GCGEMM
+ _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+
+ weights = &_weights_reshaped;
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
@@ -195,17 +150,7 @@
TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
_input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
-
- // Create tensor (interleave) to prepare input tensor for GEMM
- if(run_interleaved)
- {
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-
- TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
- _input_interleaved_reshaped.allocator()->init(interleaved_info);
- }
+ _memory_group.manage(&_input_im2col_reshaped);
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
@@ -215,27 +160,20 @@
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
_gemm_output.allocator()->init(info_gemm);
+ _memory_group.manage(&_gemm_output);
- // Configure kernels
if(dt == DataType::F16)
{
BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
input->info()->extend_padding(border_size);
_fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
}
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
+ // Configure im2col
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
- // Configure matrix multiply
- if(run_interleaved)
- {
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output);
- _input_interleaved_reshaped.allocator()->allocate();
- }
- else
- {
- configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false);
- }
+ // Configure GEMM
+ configure_mm(&_input_im2col_reshaped, weights, &_gemm_output);
+
_input_im2col_reshaped.allocator()->allocate();
// Configure Col2Im
@@ -245,38 +183,53 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
// Allocate intermediate tensor
- if(!_are_weights_reshaped)
+ _weights_reshaped.allocator()->allocate();
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
{
- _weights_reshaped.allocator()->allocate();
+ _activationlayer_function.configure(output, nullptr, act_info);
}
+
+ ARM_COMPUTE_UNUSED(weights_info);
}
void GCConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
- if(!_are_weights_reshaped)
+ if(_is_first_run)
{
- _are_weights_reshaped = true;
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_reshape_weights.run();
+ _is_first_run = false;
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
+ _memory_group.acquire();
+
// Run im2col
GCScheduler::get().dispatch(_fill_border);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_input_im2col_kernel);
- if(!_is_fully_connected_convolution)
- {
- GCScheduler::get().memory_barrier();
- // Run interleave4x4
- GCScheduler::get().dispatch(_input_interleave_kernel);
- }
-
- GCScheduler::get().memory_barrier();
- // Runs matrix multiply on reshaped matrices
- GCScheduler::get().dispatch(_mm_kernel);
+ // Run gemm on reshaped matrices
+ _mm_gemm.run();
GCScheduler::get().memory_barrier();
// Reshape output matrix
GCScheduler::get().dispatch(_output_col2im_kernel, false);
+
+ _memory_group.release();
+
+ GCScheduler::get().memory_barrier();
+ // Run Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
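The reworked layer lowers convolution to a single GEMM: im2col flattens every receptive field into a row of kw*kh*c elements (+1 when a bias is appended), and the reshaped weights supply the other operand. A standalone sketch of that shape arithmetic, with example sizes and a dilation of 1 assumed:

#include <cstdio>

int main()
{
    const unsigned w = 224, h = 224, c = 3;      // input W/H/C (example values)
    const unsigned kw = 3, kh = 3, kernels = 64; // weights W/H/number of kernels
    const unsigned stride = 1, pad = 1;
    const bool     append_bias = true;

    // Output spatial size for an undilated convolution
    const unsigned conv_w = (w + 2 * pad - kw) / stride + 1;
    const unsigned conv_h = (h + 2 * pad - kh) / stride + 1;

    const unsigned mat_weights_rows = kw * kh * c + (append_bias ? 1 : 0);
    const unsigned mat_weights_cols = kernels;
    const unsigned mat_input_cols   = mat_weights_rows; // the shared K dimension
    const unsigned mat_input_rows   = conv_w * conv_h;  // one row per output pixel

    std::printf("im2col: %ux%u, weights: %ux%u, gemm out: %ux%u\n",
                mat_input_cols, mat_input_rows,
                mat_weights_cols, mat_weights_rows,
                mat_weights_cols, mat_input_rows);
    return 0;
}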
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index 9cba371..7121654 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -35,10 +35,10 @@
{
}
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, depth_multiplier);
_kernel = std::move(k);
// Configure border handler
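The new depth_multiplier argument generalises depthwise convolution: each input channel produces depth_multiplier output channels, so (as the NEON checks later in this patch assert) the weights' channel dimension must equal input_channels * depth_multiplier. A one-line sketch of that contract:

// Contract assumed from the checks in this patch: weights->dimension(2) must
// equal input->dimension(2) * depth_multiplier; depth_multiplier == 1 is the
// classic one-filter-per-channel depthwise case.
unsigned int depthwise_output_channels(unsigned int input_channels, unsigned int depth_multiplier)
{
    return input_channels * depth_multiplier;
}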
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index a2607d4..c0cf098 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -39,26 +39,27 @@
{
}
-void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
int kernel_size = weights->info()->dimension(0);
if(kernel_size == 1)
{
auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer1x1Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, act_info);
_kernel = std::move(k);
}
else if(kernel_size == 3)
{
auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer3x3Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, act_info);
_kernel = std::move(k);
}
else if(kernel_size == 5)
{
auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer5x5Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, act_info);
_kernel = std::move(k);
}
else
@@ -79,4 +80,6 @@
GCScheduler::get().dispatch(_border_handler, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(*_kernel);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_shift_handler);
}
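Threading act_info into the kernels lets callers fuse an activation into the convolution at configure time. A hedged usage sketch: configure_fused_conv is an illustrative helper, the umbrella header is assumed, and tensor initialisation, allocation and run() are elided.

#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
using namespace arm_compute;

void configure_fused_conv(GCTensor &input, GCTensor &weights, GCTensor &bias, GCTensor &output)
{
    GCDirectConvolutionLayer conv;
    // 3x3 kernel, stride 1, pad 1, with a bounded ReLU (min(max(x, 0), 6)) fused in
    conv.configure(&input, &weights, &bias, &output,
                   PadStrideInfo(1, 1, 1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));
}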
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 9e4f0f6..a300033 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -38,9 +38,9 @@
_kernel = std::move(k);
}
-GCFullyConnectedLayer::GCFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true),
- _accumulate_biases(false)
+GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
{
}
@@ -61,6 +61,7 @@
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
// Configure im2col kernel
+ _memory_group.manage(&_im2col_output);
_im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
@@ -78,7 +79,8 @@
_mm_kernel.configure(input, weights, output, 1.0f, false);
}
-void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose_weights, bool are_weights_reshaped)
+void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
+ bool transpose_weights, bool are_weights_reshaped, bool retain_internal_weights)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
@@ -140,11 +142,14 @@
}
// Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!_are_weights_reshaped)
+ if(!_are_weights_reshaped && !retain_internal_weights)
{
// Allocate the tensor for the weights reshaped
_reshape_weights_output.allocator()->allocate();
}
+
+ ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
+ _are_weights_reshaped = _are_weights_reshaped || retain_internal_weights;
}
void GCFullyConnectedLayer::run()
@@ -156,6 +161,8 @@
_reshape_weights_kernel.run();
}
+ _memory_group.acquire();
+
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
@@ -177,4 +184,6 @@
GCScheduler::get().dispatch(_accumulate_biases_kernel);
}
+
+ _memory_group.release();
}
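These functions all follow the same memory-group protocol: manage() an intermediate before configuring its consumers, allocate() it afterwards, then bracket run() with acquire()/release() so pooled memory is only held while the function executes. A toy sketch of that protocol; Tensor and MemoryGroup here are simplified stand-ins, not the ACL classes.

#include <cstdio>

struct Tensor
{
    bool backed = false;
};

struct MemoryGroup
{
    Tensor *managed[4] = {};
    int     count      = 0;
    void manage(Tensor *t) { managed[count++] = t; }                               // configure time
    void acquire() { for(int i = 0; i < count; ++i) { managed[i]->backed = true; } }  // run() entry
    void release() { for(int i = 0; i < count; ++i) { managed[i]->backed = false; } } // run() exit
};

int main()
{
    MemoryGroup group;
    Tensor      im2col_out, gemm_out;
    group.manage(&im2col_out); // declared before configuring their consumers
    group.manage(&gemm_out);

    group.acquire(); // intermediates only hold memory while the function runs
    std::printf("backed while running: %d %d\n", im2col_out.backed, gemm_out.backed);
    group.release();
    std::printf("backed after release: %d %d\n", im2col_out.backed, gemm_out.backed);
    return 0;
}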
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 5122c20..79f8f71 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -38,59 +38,90 @@
#include "arm_compute/runtime/ITensorAllocator.h"
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
-GCGEMM::GCGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+namespace
+{
+Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
+ ARM_COMPUTE_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(gemm_info);
+ return Status{};
+}
+} // namespace
+
+GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
+ _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.reshape_b_only_on_first_run(), "Reshape matrix B only on first run is not supported");
- ARM_COMPUTE_UNUSED(gemm_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- if(c != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
- ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
- ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
- }
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
- // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- _is_interleaved_transposed = a->info()->dimension(1) > 16;
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const IGCTensor *matrix_a = a;
const IGCTensor *matrix_b = b;
+ // Get the GPU target
+ const GPUTarget gpu_target = GCScheduler::get().get_target();
+
+ // Set the target for the kernels
+ _interleave_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to GCGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->info()->dimension(1);
+ const int n = b->info()->dimension(0);
+ const int k = a->info()->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
if(_is_interleaved_transposed)
{
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- const unsigned int transpose_w = max_gc_vector_width / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- _tmp_a.allocator()->init(info_a);
-
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
- _tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+ // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
@@ -99,7 +130,7 @@
_transpose_kernel.configure(b, &_tmp_b);
}
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
if(_is_interleaved_transposed)
{
@@ -116,15 +147,31 @@
}
}
+Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+ return Status{};
+}
+
void GCGEMM::run()
{
+ _memory_group.acquire();
if(_is_interleaved_transposed)
{
// Run interleave kernel
GCScheduler::get().dispatch(_interleave_kernel, false);
- // Run transpose kernel
- GCScheduler::get().dispatch(_transpose_kernel, false);
+ if(_is_first_run)
+ {
+ // Run transpose kernel
+ GCScheduler::get().dispatch(_transpose_kernel, false);
+ _is_first_run = false;
+ }
+ else if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ GCScheduler::get().dispatch(_transpose_kernel, false);
+ }
GCScheduler::get().memory_barrier();
}
@@ -137,4 +184,5 @@
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_ma_kernel);
}
+ _memory_group.release();
}
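GEMMInfo's third flag is what GCConvolutionLayer relies on above: the constant B matrix is transposed once on the first run and reused afterwards. A hedged usage sketch; run_gemm_twice is an illustrative helper and tensor initialisation is elided.

#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
using namespace arm_compute;

void run_gemm_twice(GCTensor &a, GCTensor &b, GCTensor &out)
{
    GCGEMM gemm;
    gemm.configure(&a, &b, nullptr, &out, 1.0f, 0.0f,
                   GEMMInfo(false /* A not reshaped */, false /* B not reshaped */,
                            true /* reshape B only on the first run */));
    gemm.run(); // first run: interleaves A, transposes B, multiplies
    gemm.run(); // subsequent runs: the transpose of B is skipped
}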
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index fc3882d..b2e69ee 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,8 +33,8 @@
using namespace arm_compute;
-GCNormalizationLayer::GCNormalizationLayer()
- : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+GCNormalizationLayer::GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
{
}
@@ -43,6 +43,7 @@
ARM_COMPUTE_ERROR_ON(input == nullptr);
_squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+ _memory_group.manage(&_squared_input);
_norm_kernel.configure(input, &_squared_input, output, norm_info);
_multiply_kernel.configure(input, input, &_squared_input, 1.0f);
@@ -55,9 +56,13 @@
void GCNormalizationLayer::run()
{
+ _memory_group.acquire();
+
GCScheduler::get().dispatch(_multiply_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_border_handler, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_norm_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 5221c5c..1748a59 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,8 @@
using namespace arm_compute;
-GCSoftmaxLayer::GCSoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
{
}
@@ -50,6 +50,11 @@
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -63,9 +68,13 @@
void GCSoftmaxLayer::run()
{
+ _memory_group.acquire();
+
GCScheduler::get().dispatch(_max_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_norm_kernel);
+
+ _memory_group.release();
}
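The three kernels implement the standard numerically stable softmax: subtract the row maximum before exponentiating so exp() cannot overflow, then normalise by the sum. A standalone sketch of the same three passes:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

std::vector<float> softmax(const std::vector<float> &x)
{
    const float max_val = *std::max_element(x.begin(), x.end()); // pass 1: _max_kernel
    std::vector<float> out(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i) // pass 2: _shift_exp_sum_kernel
    {
        out[i] = std::exp(x[i] - max_val);
        sum += out[i];
    }
    for(float &v : out) // pass 3: _norm_kernel
    {
        v /= sum;
    }
    return out;
}

int main()
{
    for(float v : softmax({ 1.f, 2.f, 3.f }))
    {
        std::printf("%f ", v);
    }
    std::printf("\n");
    return 0;
}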
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 583cb40..54a2bd2 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,202 +23,20 @@
*/
#include "arm_compute/runtime/IScheduler.h"
-#include <array>
-#include <cstdlib>
-#include <cstring>
-#include <fcntl.h>
-#include <fstream>
-#include <map>
-#include <sched.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#ifndef BARE_METAL
-#include <regex>
-#include <thread>
-#endif /* BARE_METAL */
-
-namespace
-{
-unsigned int get_threads_hint()
-{
- unsigned int num_threads_hint = 1;
-
-#ifndef BARE_METAL
- std::map<std::string, unsigned int> cpu_part_occurrence_map;
-
- // CPU part regex
- std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
- std::smatch cpu_part_match;
-
- // Read cpuinfo and get occurrence of each core
- std::ifstream cpuinfo;
- cpuinfo.open("/proc/cpuinfo", std::ios::in);
- if(cpuinfo.is_open())
- {
- std::string line;
- while(bool(getline(cpuinfo, line)))
- {
- if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
- {
- std::string cpu_part = cpu_part_match[1];
- if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
- {
- cpu_part_occurrence_map[cpu_part]++;
- }
- else
- {
- cpu_part_occurrence_map[cpu_part] = 1;
- }
- }
- }
- }
-
- // Get min number of threads
- auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
- [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
- {
- return p1.second < p2.second;
- });
-
- // Set thread hint
- num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
-#endif /* BARE_METAL */
-
- return num_threads_hint;
-}
-
-unsigned int get_cpu_impl()
-{
-#ifndef BARE_METAL
- int fd = open("/proc/cpuinfo", 0); // NOLINT
- std::array<char, 3000> buff{ {} };
- char *pos = nullptr;
- char *end = nullptr;
- bool foundid = false;
-
- int cpu = sched_getcpu();
-
- if(fd == -1)
- {
- return 0;
- }
-
- int charsread = read(fd, buff.data(), 3000);
- pos = buff.data();
- end = buff.data() + charsread;
-
- close(fd);
-
- /* So, to date I've encountered two formats for /proc/cpuinfo.
- *
- * One of them just lists processor : n for each processor (with no
- * other info), then at the end lists part information for the current
- * CPU.
- *
- * The other has an entire clause (including part number info) for each
- * CPU in the system, with "processor : n" headers.
- *
- * We can cope with either of these formats by waiting to see
- * "processor: n" (where n = our CPU ID), and then looking for the next
- * "CPU part" field.
- */
- while(pos < end)
- {
- if(foundid && strncmp(pos, "CPU part", 8) == 0)
- {
- /* Found part number */
- pos += 11;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- return strtoul(pos, nullptr, 0);
- }
-
- if(strncmp(pos, "processor", 9) == 0)
- {
- /* Found processor ID, see if it's ours. */
- pos += 11;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- int num = strtol(pos, nullptr, 0);
-
- if(num == cpu)
- {
- foundid = true;
- }
- }
-
- while(pos < end)
- {
- char ch = *pos++;
- if(ch == '\n' || ch == '\0')
- {
- break;
- }
- }
- }
-#endif /* BARE_METAL */
-
- return 0;
-}
-} // namespace
+#include "arm_compute/runtime/CPUUtils.h"
namespace arm_compute
{
IScheduler::IScheduler()
+ : _cpu_info()
{
// Work out the best possible number of execution threads
_num_threads_hint = get_threads_hint();
-
- // Work out the CPU implementation
- switch(get_cpu_impl())
- {
- case 0xd0f:
- _info.CPU = CPUTarget::A55_DOT;
- break;
- case 0xd03:
- _info.CPU = CPUTarget::A53;
- break;
- default:
-#ifdef __arm__
- _info.CPU = CPUTarget::ARMV7;
-#elif __aarch64__
- _info.CPU = CPUTarget::ARMV8;
-#else /* __arm__ || __aarch64__ */
- _info.CPU = CPUTarget::INTRINSICS;
-#endif /* __arm__ || __aarch64__ */
- break;
- }
-
- _info.L1_size = 31000;
- _info.L2_size = 500000;
}
-void IScheduler::set_target(CPUTarget target)
+CPUInfo &IScheduler::cpu_info()
{
- _info.CPU = target;
-}
-
-CPUInfo IScheduler::cpu_info() const
-{
- return _info;
+ return _cpu_info;
}
unsigned int IScheduler::num_threads_hint() const
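IScheduler now exposes a mutable CPUInfo populated by the shared CPUUtils code instead of parsing /proc/cpuinfo itself. A hedged sketch of the query this patch makes in NEConvolutionLayer below; is_a53 is an illustrative helper.

#include "arm_compute/runtime/Scheduler.h"
using namespace arm_compute;

bool is_a53()
{
    // Mirrors the check in NEConvolutionLayer::get_convolution_method
    return Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53;
}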
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 2c64475..faaff8a 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@
using namespace arm_compute;
ISimpleLifetimeManager::ISimpleLifetimeManager()
- : _active_group(nullptr), _active_elements(), _finalized_groups()
+ : _active_group(nullptr), _active_elements(), _free_blobs(), _occupied_blobs(), _finalized_groups()
{
}
@@ -53,14 +53,21 @@
void ISimpleLifetimeManager::start_lifetime(void *obj)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+ ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+
+ // Check if there is a free blob
+ if(_free_blobs.empty())
{
- return obj == e.id;
- }) != std::end(_active_elements),
- "Memory object is already registered!");
+ _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+ }
+ else
+ {
+ _occupied_blobs.splice(std::begin(_occupied_blobs), _free_blobs, std::begin(_free_blobs));
+ _occupied_blobs.front().id = obj;
+ }
    // Insert object in groups and mark its finalized state as false
- _active_elements.emplace_back(obj);
+ _active_elements.insert(std::make_pair(obj, obj));
}
void ISimpleLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
@@ -68,36 +75,50 @@
ARM_COMPUTE_ERROR_ON(obj == nullptr);
// Find object
- auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
- {
- return obj == e.id;
- });
- ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
+ auto active_object_it = _active_elements.find(obj);
+ ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
// Update object fields and mark object as complete
- it->handle = handle;
- it->size = size;
- it->status = true;
+ Element &el = active_object_it->second;
+ el.handle = handle;
+ el.size = size;
+ el.status = true;
+
+ // Find object in the occupied lists
+ auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
+ {
+ return obj == b.id;
+ });
+ ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs));
+
+ // Update occupied blob and return as free
+ occupied_blob_it->bound_elements.insert(obj);
+ occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
+ occupied_blob_it->id = nullptr;
+ _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
    // Check if all objects are finalized and reset the active group
if(are_all_finalized())
{
- // Update finalized groups
- _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
+ ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty());
// Update blobs and group mappings
update_blobs_and_mappings();
+ // Update finalized groups
+ _finalized_groups[_active_group] = std::move(_active_elements);
+
// Reset state
_active_elements.clear();
_active_group = nullptr;
+ _free_blobs.clear();
}
}
bool ISimpleLifetimeManager::are_all_finalized() const
{
- return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
+ return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e)
{
- return !e.status;
+ return !e.second.status;
});
}
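The lifetime manager now recycles Blob records between a free list and an occupied list with std::list::splice, which relinks a node in O(1) without copying or invalidating it. A standalone sketch of that recycling; Blob here is a simplified stand-in.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <list>

struct Blob
{
    void       *id;
    std::size_t max_size;
};

int main()
{
    std::list<Blob> free_blobs{ Blob{ nullptr, 128 } };
    std::list<Blob> occupied_blobs;

    int   dummy;
    void *obj = &dummy;

    // start_lifetime: take a free blob (or create one) and mark it occupied
    occupied_blobs.splice(occupied_blobs.begin(), free_blobs, free_blobs.begin());
    occupied_blobs.front().id = obj;

    // end_lifetime: record the high-water size and return the blob as free
    occupied_blobs.front().max_size = std::max<std::size_t>(occupied_blobs.front().max_size, 256);
    occupied_blobs.front().id       = nullptr;
    free_blobs.splice(free_blobs.begin(), occupied_blobs, occupied_blobs.begin());

    std::printf("free=%zu occupied=%zu max=%zu\n",
                free_blobs.size(), occupied_blobs.size(), free_blobs.front().max_size);
    return 0;
}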
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index 35d0c82..15bbb17 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,40 +23,45 @@
*/
#include "arm_compute/runtime/Memory.h"
-#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/MemoryRegion.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
Memory::Memory()
- : _memory(nullptr), _memory_owned(nullptr)
+ : _region(nullptr), _region_owned(nullptr)
{
+ create_empty_region();
}
-Memory::Memory(std::shared_ptr<uint8_t> memory)
- : _memory(nullptr), _memory_owned(std::move(memory))
+Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
+ : _region(nullptr), _region_owned(std::move(memory))
{
- ARM_COMPUTE_ERROR_ON(_memory_owned.get() == nullptr);
- _memory = _memory_owned.get();
+ if(_region_owned == nullptr)
+ {
+ create_empty_region();
+ }
+ _region = _region_owned.get();
}
-Memory::Memory(uint8_t *memory)
- : _memory(memory), _memory_owned(nullptr)
+Memory::Memory(IMemoryRegion *memory)
+ : _region(memory), _region_owned(nullptr)
{
- ARM_COMPUTE_ERROR_ON(memory == nullptr);
+ _region = memory;
}
-uint8_t *Memory::buffer()
+IMemoryRegion *Memory::region()
{
- return _memory;
+ return _region;
}
-uint8_t *Memory::buffer() const
+IMemoryRegion *Memory::region() const
{
- return _memory;
+ return _region;
}
-uint8_t **Memory::handle()
+void Memory::create_empty_region()
{
- ARM_COMPUTE_ERROR_ON(_memory_owned.get() != nullptr);
- return &_memory;
-}
\ No newline at end of file
+ _region_owned = std::make_shared<MemoryRegion>(0);
+ _region = _region_owned.get();
+}
+} // namespace arm_compute
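Memory now wraps an IMemoryRegion and is never empty in the raw-pointer sense: default construction creates a size-0 region. A hedged usage sketch, assuming only the constructors and accessors shown above:

#include "arm_compute/runtime/Memory.h"
#include "arm_compute/runtime/MemoryRegion.h"
#include <memory>
using namespace arm_compute;

int main()
{
    Memory empty;                                       // holds an empty (size 0) region, never nullptr
    Memory owned(std::make_shared<MemoryRegion>(1024)); // owning, 1 KiB region
    return (empty.region() != nullptr && owned.region() != nullptr) ? 0 : 1;
}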
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..b5b159a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
+ : _kernel()
+{
+}
+
+void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ _kernel.configure(input, output, original_input_shape, data_layout);
+}
+
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ return NEConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
+
+void NEConvertFullyConnectedWeights::run()
+{
+ NEScheduler::get().schedule(&_kernel, Window::DimZ);
+}
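A hedged usage sketch of the new function: converting fully connected weights trained against one data layout for use with the other. convert_weights is an illustrative helper, tensor allocation is elided, and bool(Status) is true on success, as used elsewhere in this patch.

#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
using namespace arm_compute;

void convert_weights(ITensor *src, ITensor *dst, const TensorShape &original_input_shape)
{
    // validate() mirrors configure() and returns a Status instead of asserting
    if(bool(NEConvertFullyConnectedWeights::validate(src->info(), dst->info(), original_input_shape, DataLayout::NCHW)))
    {
        NEConvertFullyConnectedWeights convert;
        convert.configure(src, dst, original_input_shape, DataLayout::NCHW);
        convert.run();
    }
}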
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 0a49158..7053c7e 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -30,41 +30,44 @@
#include <cmath>
#include <tuple>
+#include <utility>
namespace arm_compute
{
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _function()
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT
+ : _memory_manager(std::move(memory_manager)),
+ _function()
{
}
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+ enable_fast_math));
- switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
- weights_info))
+ switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
{
- auto f = arm_compute::support::cpp14::make_unique<NEWinogradLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info);
+ auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_function = std::move(f);
break;
}
case ConvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
_function = std::move(f);
break;
}
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info);
+ f->configure(input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
@@ -75,21 +78,21 @@
}
Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
- switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info))
+ switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
//Validate Winograd
- NEWinogradLayer::validate(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM:
//Validate Gemm-based Convolution
- NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
break;
case ConvolutionMethod::DIRECT:
//Validate Gemm-based Convolution
- NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -98,17 +101,20 @@
return Status{};
}
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
- ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);
- if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
- && (conv_info.stride().second == 1) && (biases != nullptr))
+
+ if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
+ || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
{
- return ConvolutionMethod::WINOGRAD;
+ return ConvolutionMethod::GEMM;
}
- return ConvolutionMethod::GEMM;
+
+ return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}
void NEConvolutionLayer::run()
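The selection heuristic is now data-driven: dilated convolutions, Cortex-A53 targets, and inputs with at most 16 channels fall back to GEMM, while Winograd is chosen only when its validate() passes. A hedged sketch of querying the chosen method up front; query_method is an illustrative helper and enable_fast_math is left at its default.

#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
using namespace arm_compute;

ConvolutionMethod query_method(const ITensorInfo *input, const ITensorInfo *weights,
                               const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    // Undilated, no fused activation; Winograd is returned only if it validates
    return NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info,
                                                      WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo());
}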
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index c1ba5dd..40ada8f 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -34,6 +34,7 @@
NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
: _memory_group(std::move(memory_manager)),
_conv_f(),
+ _upsample_f(),
_scaled_output(),
_input(nullptr),
_info(),
@@ -41,13 +42,64 @@
{
}
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
+ unsigned int inner_border_right, unsigned int inner_border_top)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
+ info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, bias);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ }
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ }
+
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
+ info)));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != scale_out_info.dimension(i));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+
+ return Status{};
+}
+
void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
unsigned int inner_border_right, unsigned int inner_border_top)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(!info.padding_is_symmetric());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
_input = input;
_info = info;
@@ -55,15 +107,9 @@
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
- info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
-
- ARM_COMPUTE_UNUSED(output_shape);
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
_memory_group.manage(&_scaled_output);
@@ -79,44 +125,20 @@
// Allocate auxiliary tensors
_scaled_output.allocator()->allocate();
+
+ // configure upsample function
+ _upsample_f.configure(input, &_scaled_output, info, inner_border_right, inner_border_top);
}
void NEDeconvolutionLayer::run()
{
_memory_group.acquire();
- // Initialize _scaled_output buffer
- const int width_in = _input->info()->dimension(0);
- const int height_in = _input->info()->dimension(1);
- const int width_scaled = _scaled_output.info()->dimension(0);
- const int height_scaled = _scaled_output.info()->dimension(1);
- const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in);
- const int stride_x = _info.stride().first;
- const int stride_y = _info.stride().second;
-
- std::fill_n(_scaled_output.buffer(), _scaled_output.info()->total_size(), 0);
-
- // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output
- // and insert rows and columns with zeroes depending on the stride values.
- for(int slice = 0; slice < num_2d_slices; ++slice)
- {
- const int start_x = _info.pad().first;
- const int start_y = _inner_border.second + _info.pad().second;
- const int end_y = height_scaled - _info.pad().second;
- const int end_x = width_scaled - _inner_border.first - _info.pad().first;
-
- for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
- {
- for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++)
- {
- const auto in = *(reinterpret_cast<float *>(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice))));
- *(reinterpret_cast<float *>(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in;
- }
- }
- }
+ // Run upsample kernel
+ _upsample_f.run();
// Run convolution layer
_conv_f.run();
_memory_group.release();
-}
+}
\ No newline at end of file
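The hand-rolled zero-insertion loop is replaced by an upsample function followed by a stride-1 convolution. A standalone sketch of the geometry behind deconvolution_output_dimensions(), assuming symmetric padding: inserting (stride - 1) zeros between input elements (plus an optional inner border) stretches each dimension, and the stride-1 convolution then yields the output extent below.

#include <cstdio>

unsigned int deconv_out_dim(unsigned int in, unsigned int kernel, unsigned int pad,
                            unsigned int inner_border, unsigned int stride)
{
    // Transposed-convolution output extent for one spatial dimension
    return (in - 1) * stride + inner_border + kernel - 2 * pad;
}

int main()
{
    // e.g. 4x4 input, 3x3 kernel, stride 2, pad 1, no inner border -> 7x7 output
    std::printf("%u\n", deconv_out_dim(4, 3, 1, 0, 2));
    return 0;
}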
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf88..0a977ad 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -37,11 +37,11 @@
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
: _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
- _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
+ _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
{
}
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -52,30 +52,39 @@
_has_bias = biases != nullptr;
_is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
conv_info,
- input->info()->data_type());
+ input->info()->data_type(),
+ depth_multiplier,
+ input->info()->data_layout());
_are_weights_reshaped = false;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+
+ ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
if(_is_optimized)
{
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ if(_is_nchw)
+ {
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
- // Configure optimized depthwise
- _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+ // Configure optimized depthwise
+ _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+ // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+ _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
- // Allocate tensors
- _input_nhwc.allocator()->allocate();
- _weights_hwio.allocator()->allocate();
- _output_nhwc.allocator()->allocate();
-
- // Create convolver (deferred)
- _dwc_kernel.generate_convolver();
+ // Allocate tensors
+ _input_nhwc.allocator()->allocate();
+ _weights_hwio.allocator()->allocate();
+ _output_nhwc.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+ }
}
else
{
@@ -88,7 +97,7 @@
}
// Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
// Configure border handler
_border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -116,8 +125,15 @@
void NEDepthwiseConvolutionLayer3x3::run()
{
+ if(_is_first_run && _is_optimized)
+ {
+ _is_first_run = false;
+ // Create convolver (deferred)
+ _dwc_kernel.generate_convolver();
+ }
+
// Permute weights in HWIO format if the optimized kernel will be executed
- if(!_are_weights_reshaped && _is_optimized)
+ if(!_are_weights_reshaped && _is_optimized && _is_nchw)
{
_are_weights_reshaped = true;
_permute_weights.run();
@@ -126,8 +142,11 @@
// Handle input
if(_is_optimized)
{
- // Permute input to NHWC format execution
- _permute_input.run();
+ if(_is_nchw)
+ {
+ // Permute input to NHWC format for execution
+ _permute_input.run();
+ }
}
else
{
@@ -139,7 +158,7 @@
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
// Permute output to ACL's native NCHW format in case of NHWC execution
- if(_is_optimized)
+ if(_is_optimized && _is_nchw)
{
_permute_output.run();
}
@@ -153,31 +172,37 @@
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
{
}
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
const size_t weights_w = weights->info()->dimension(0);
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_first_run = true;
+ _original_weights = weights;
// Should bias be appended?
bool append_bias = (biases != nullptr) && !_is_quantized;
// Calculate output shape
- TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Auto-initialize the output if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
// Output width and height
- const unsigned int conv_w = dwc_output_shape.x();
- const unsigned int conv_h = dwc_output_shape.y();
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
// Set up intermediate tensors
const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
@@ -189,7 +214,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -204,7 +229,7 @@
shape_v2mm_out.set(2, 1);
_v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
_vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
// Output staged configuration
@@ -241,10 +266,21 @@
void NEDepthwiseConvolutionLayer::run()
{
+ // Run weights reshaping (runs once per configure call)
+ if(_is_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+ _is_first_run = false;
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
+ }
+
NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
- NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
if(_is_quantized)
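
The depthwise changes above defer convolver generation from configure() to the first run(), gated by _is_first_run, and release the original weights once the reshaped copy exists. Below is a standalone sketch of that run-once idiom with hypothetical names; in the real function, prepare() corresponds to generate_convolver() and the one-time weights permute.

#include <iostream>

// "Prepare on first run": expensive, input-independent work moves out of
// configure() and happens exactly once, on the first call to run().
class FirstRunFunction
{
public:
    void configure()
    {
        _is_first_run = true; // reset so a reconfigure prepares again
    }

    void run()
    {
        if(_is_first_run)
        {
            _is_first_run = false;
            prepare();
        }
        std::cout << "execute kernels\n";
    }

private:
    void prepare()
    {
        std::cout << "one-time preparation\n";
    }
    bool _is_first_run{ true };
};

int main()
{
    FirstRunFunction f;
    f.configure();
    f.run(); // prepares, then executes
    f.run(); // executes only
}
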
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index a58b6e4..0627977 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
@@ -34,8 +35,18 @@
{
}
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
+
+ return Status{};
+}
+
void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
// Configure kernel
_dequantize_kernel.configure(input, output, min_max);
}
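
The new static validate() mirrors configure(): it performs the same argument checks but returns a Status instead of asserting, so callers can probe support before committing resources. A self-contained sketch of that contract, with hypothetical types standing in for the ACL ones:

#include <iostream>
#include <string>

// Hypothetical Status/Layer pair illustrating the validate()-then-configure()
// pattern; only the control flow matches the diff, not the real checks.
struct Status
{
    std::string error;
    explicit operator bool() const { return error.empty(); } // true == OK
};

struct Layer
{
    static Status validate(int in_elems, int out_elems)
    {
        if(in_elems != out_elems)
        {
            return Status{ "shape mismatch" };
        }
        return Status{};
    }

    void configure(int in_elems, int out_elems)
    {
        // The real configure() asserts on invalid arguments; callers that need
        // a recoverable error path call validate() first.
        static_cast<void>(in_elems);
        static_cast<void>(out_elems);
    }
};

int main()
{
    if(Status s = Layer::validate(16, 16))
    {
        Layer l;
        l.configure(16, 16);
        std::cout << "configured\n";
    }
}
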
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index c26c99a..445864c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,18 +34,23 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false), _is_fixed_point(false)
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), _is_fixed_point(false),
+ _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
{
}
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
+ ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+
// Free accumulator
if(_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
+ _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
// Check if bias should be added in the convolution result
_has_bias = (bias != nullptr);
@@ -73,9 +78,17 @@
// Add zero padding XY
_input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
@@ -101,6 +114,11 @@
// Validate bias kernel
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+
return Status{};
}
@@ -110,10 +128,15 @@
_memory_group.acquire();
- NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+ NEScheduler::get().schedule(&_conv_kernel, _dim_split);
if(_has_bias || _is_fixed_point)
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
}
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
_memory_group.release();
}
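
Two behavioural additions above: the convolution kernel is now scheduled along a layout-dependent window dimension (_dim_split is DimZ for NCHW, DimY for NHWC), and an optional activation layer is fused after the output stage. The mapping below is a trivial standalone restatement of that choice, using stand-in enums rather than the real Window constants; the rationale given in the comment is an editorial assumption.

#include <cassert>

// With NCHW the window is split across the channel-plane dimension (DimZ);
// with NHWC, where channels are innermost, it is split across rows (DimY),
// plausibly so each thread keeps contiguous reads.
enum class DataLayout { NCHW, NHWC };
enum class SplitDim  { DimY, DimZ };

SplitDim split_dimension(DataLayout layout)
{
    return layout == DataLayout::NCHW ? SplitDim::DimZ : SplitDim::DimY;
}

int main()
{
    assert(split_dimension(DataLayout::NCHW) == SplitDim::DimZ);
    assert(split_dimension(DataLayout::NHWC) == SplitDim::DimY);
}
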
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 26b7271..958d081 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -132,7 +132,7 @@
NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
- _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
+ _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
{
}
@@ -163,6 +163,7 @@
const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+ _original_weights = weights;
_linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
_are_weights_reshaped = are_weights_reshaped;
_accumulate_biases = biases != nullptr;
@@ -187,7 +188,7 @@
if(_linearize_input)
{
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input->info(), num_input_dimensions)));
+ _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
// Configure im2col kernel
_memory_group.manage(&_im2col_output);
@@ -287,7 +288,7 @@
if(linearize_input)
{
- im2col_output->set_tensor_shape(compute_im2col_shape(input, num_input_dimensions));
+ im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
@@ -324,8 +325,13 @@
// Reshape of the weights (happens only once)
if(!_are_weights_reshaped)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_are_weights_reshaped = true;
_reshape_weights_kernel.run();
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
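
NEFullyConnectedLayer now records _original_weights so that the one-time reshape in run() can both assert the tensor is still alive (is_used()) and flag it as reclaimable (mark_as_unused()) once the reshaped copy exists. A toy model of that handshake, with a hypothetical tensor class:

#include <cassert>

// MockTensor models only the used/unused flag relevant to the diff above:
// once the reshaped copy exists, the original weights are marked unused so a
// memory manager may reclaim them, and reshaping twice is a detectable error.
class MockTensor
{
public:
    bool is_used() const { return _used; }
    void mark_as_unused() { _used = false; }

private:
    bool _used{ true };
};

int main()
{
    MockTensor original_weights;
    bool are_weights_reshaped = false;

    if(!are_weights_reshaped)
    {
        assert(original_weights.is_used()); // ARM_COMPUTE_ERROR_ON(!...) in the diff
        are_weights_reshaped = true;        // the reshape kernel would run here
        original_weights.mark_as_unused();
    }
    assert(!original_weights.is_used());
}
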
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 05907ba..9168ed4 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,37 +26,20 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
#include <cmath>
namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
_run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
@@ -83,41 +66,14 @@
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
+ && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
+
// Check if the first input tensor is a vector.
// If so, all the kernels for reshaping the tensors can be skipped
if(_run_vector_matrix_multiplication)
{
-#if defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
- }
-
- if(_mm_optimised_kernel != nullptr)
- {
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- size_t workbench_size = 0;
-
- if(a->info()->data_type() == DataType::F32)
- {
- workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
- }
-
- constexpr size_t alignment = 4096;
- ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* defined(__aarch64__) */
+ if(!run_optimised)
{
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha, false);
@@ -132,65 +88,7 @@
}
else
{
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
- else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-#if defined(__arm__) || defined(__aarch64__)
- if(_mm_optimised_kernel != nullptr)
- {
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int M = d->info()->tensor_shape().y();
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- size_t workbench_size = 0;
-
-#if defined(__arm__)
- workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
-#elif defined(__aarch64__)
- if(a->info()->data_type() == DataType::F32)
- {
- workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
- }
- else if(a->info()->data_type() == DataType::F16)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* defined(__arm__) || defined(__aarch64__) */
+ if(!run_optimised)
{
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -210,7 +108,10 @@
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
int m = a->info()->dimension(1);
int n = b->info()->dimension(0);
@@ -243,9 +144,9 @@
{
_memory_group.acquire();
- if(_mm_optimised_kernel != nullptr)
+ if(_asm_glue._optimised_kernel != nullptr)
{
- NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _asm_glue.run();
_memory_group.release();
}
else
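
The per-architecture kernel selection above (#if __arm__ / __aarch64__ blocks, CPU-target checks, manual workspace sizing) collapses into one call to setup_assembly_kernel() from AssemblyHelper.h: it either configures the _asm_glue object or reports failure, and only then is the generic interleaved path configured. The control flow, reduced to a self-contained sketch with hypothetical types:

#include <iostream>

// AsmGlue and setup_assembly_kernel() are stand-ins; the real helper also
// allocates the workspace, optionally pretransposes B, and picks a kernel for
// the running CPU before reporting success.
struct AsmGlue
{
    bool configured{ false };
    void run() { std::cout << "assembly GEMM\n"; }
};

bool setup_assembly_kernel(AsmGlue &glue, bool inputs_supported)
{
    glue.configured = inputs_supported;
    return glue.configured;
}

int main()
{
    AsmGlue glue;
    const bool run_optimised = setup_assembly_kernel(glue, /*inputs_supported=*/true);
    if(run_optimised)
    {
        glue.run();
    }
    else
    {
        std::cout << "generic interleave/transpose/multiply kernels\n";
    }
}
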
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a85078c..2888b43 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,9 +23,6 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
@@ -34,13 +31,6 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
#include <cmath>
#include <tuple>
@@ -175,19 +165,28 @@
}
}
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
- bool &append_bias,
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info, DataType &dt,
+ bool &append_bias, bool &skip_im2col,
bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
- bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized,
+ bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized, bool &is_activationlayer_enabled,
unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
- unsigned int &conv_w, unsigned int &conv_h)
+ unsigned int &conv_w, unsigned int &conv_h, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(idx_channel) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32, "NHWC is only supported for FP32 data type.");
dt = input->data_type();
is_quantized = is_data_type_quantized_asymmetric(dt);
@@ -207,28 +206,32 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
append_bias = (biases != nullptr) && (!is_quantized);
are_weights_reshaped = weights_info.are_reshaped();
- kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
- kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+ kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(idx_width);
+ kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(idx_height);
mat_weights_cols = weights->dimension(3);
- mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+ // A 1x1 convolution in NHWC layout can skip im2col; decide this first, since mat_weights_rows depends on it
+ skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1);
+ mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + ((append_bias && !skip_im2col) ? 1 : 0);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
- conv_info);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), input->dimension(idx_height), kernel_width, kernel_height,
+ conv_info, dilation);
// Check if its a "fully connected" convolution
is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
is_interleaved = (!is_fully_connected_convolution && !is_quantized);
+ is_activationlayer_enabled = act_info.enabled();
return Status{};
}
} // namespace
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager),
- _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
- _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
+ : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+ _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
+ _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
+ _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
{
}
@@ -256,26 +259,8 @@
}
}
-void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
-{
- ARM_COMPUTE_UNUSED(ci);
- ARM_COMPUTE_UNUSED(M);
- ARM_COMPUTE_UNUSED(N);
- ARM_COMPUTE_UNUSED(K);
-#if defined(__arm__) || defined(__aarch64__)
-#if defined(__arm__)
- GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
- GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-}
-
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -288,45 +273,35 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _append_bias, _are_weights_reshaped,
+ _data_layout = input->info()->data_layout();
+ const bool is_nhwc = _data_layout == DataLayout::NHWC;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, act_info, dt, _append_bias, _skip_im2col,
+ _are_weights_reshaped,
kernel_width, kernel_height,
- _is_fully_connected_convolution, _is_interleaved, _is_quantized,
- mat_weights_cols, mat_weights_rows, conv_w, conv_h);
+ _is_fully_connected_convolution, _is_interleaved, _is_quantized, _is_activationlayer_enabled,
+ mat_weights_cols, mat_weights_rows, conv_w, conv_h, dilation);
ARM_COMPUTE_ERROR_THROW_ON(status);
+ _original_weights = weights;
const unsigned int fixed_point_position = input->info()->fixed_point_position();
const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
+ bool run_optimised = dt == DataType::F32;
// Reshape weights if needed
- if(_mm_optimised_kernel != nullptr)
+ if(run_optimised)
{
- if(_are_weights_reshaped)
- {
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(1);
- }
- else
- {
- TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
- // Create tensor to store the reshaped weights
- _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
- weights = &_weights_reshaped;
- }
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ weights = &_weights_reshaped;
}
else
{
@@ -335,12 +310,12 @@
if(_is_fully_connected_convolution || _is_quantized)
{
mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(1);
+ mat_weights_rows = weights->info()->dimension(idx_height);
}
else
{
mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(2) + (_append_bias ? 1 : 0);
+ mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(idx_channel) + (_append_bias ? 1 : 0);
}
}
else
@@ -366,66 +341,56 @@
}
}
- // Create tensor to store im2col reshaped inputs
- const unsigned int mat_input_cols = mat_weights_rows;
- const unsigned int mat_input_rows = conv_w * conv_h;
-
- TensorShape shape_im2col(input->info()->tensor_shape());
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
- shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _memory_group.manage(&_input_im2col_reshaped);
-
- // Create tensor (interleave) to prepare input tensor for GEMM
- if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
+ // Configure im2col and the GEMM input reshaping when im2col is not skipped; otherwise the bias (if any) is added by a separate kernel
+ if(!_skip_im2col)
{
- TensorShape shape_interleaved(shape_im2col);
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
- _memory_group.manage(&_input_interleaved_reshaped);
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+
+ // Create tensor to store im2col reshaped inputs
+ TensorShape shape_im2col(input->info()->tensor_shape());
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _memory_group.manage(&_input_im2col_reshaped);
+
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution && !run_optimised && _is_interleaved)
+ {
+ TensorShape shape_interleaved(shape_im2col);
+ shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+ shape_interleaved.set(idx_height, std::ceil(shape_interleaved[idx_height] / 4.f));
+ _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
+ _memory_group.manage(&_input_interleaved_reshaped);
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+ // For quantized asymmetric input the GEMM output is S32, keeping the raw integer accumulators until the quantized output stage
+ TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+ info_gemm.set_quantization_info(output->info()->quantization_info());
+ _gemm_output.allocator()->init(info_gemm);
+
+ // Configure im2col
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+ }
+ else if(_append_bias)
+ {
+ // Configure add bias kernel
+ _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
}
- // Create GEMM output tensor
- TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
- const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
- info_gemm.set_quantization_info(output->info()->quantization_info());
- _gemm_output.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- // Configure kernels
- // Configure im2col
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
-
// Configure matrix multiply
- if(_mm_optimised_kernel != nullptr)
+ if(run_optimised)
{
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int M = _gemm_output.info()->tensor_shape().y();
- const int N = _gemm_output.info()->tensor_shape().x();
- const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-
-#if defined(__aarch64__)
- if((N <= 128) && (K <= 128))
+ if(!setup_assembly_kernel(_skip_im2col ? input : &_input_im2col_reshaped, weights, is_nhwc ? output : &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
{
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
+ ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
}
- else
-#endif /* defined(__aarch64__) */
- {
- configure_asm_mm(ci, M, N, K);
- }
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
-
- _workspace.allocator()->allocate();
}
else
{
@@ -435,8 +400,8 @@
_input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
// Configure GEMM
- configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(1), 0 /* no transpose */,
- _input_im2col_reshaped.info()->dimension(0)));
+ configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(idx_height), 0 /* no transpose */,
+ _input_im2col_reshaped.info()->dimension(idx_width)));
_input_interleaved_reshaped.allocator()->allocate();
}
else
@@ -445,48 +410,63 @@
}
}
- _input_im2col_reshaped.allocator()->allocate();
-
- // Configure output stage for quantized case
- if(_is_quantized)
+ if(!_skip_im2col)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ _input_im2col_reshaped.allocator()->allocate();
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _memory_group.manage(&_tmp_output);
- _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+ // Configure output stage for quantized case
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _memory_group.manage(&_tmp_output);
+ _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+ }
+
+ // Configure Col2Im
+ if(!is_nhwc)
+ {
+ _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+ }
+
+ if(_is_quantized)
+ {
+ _tmp_output.allocator()->allocate();
+ }
+ _gemm_output.allocator()->allocate();
}
- // Configure Col2Im
- _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
- if(_is_quantized)
- {
- _tmp_output.allocator()->allocate();
- }
- _gemm_output.allocator()->allocate();
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
// Allocate intermediate tensor
if(!_are_weights_reshaped)
{
_weights_reshaped.allocator()->allocate();
}
+
+ // Configure Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(output);
DataType dt{};
bool append_bias{};
+ bool skip_im2col{};
bool are_weights_reshaped{};
bool is_fully_connected_convolution{};
bool is_interleaved{};
bool is_quantized{};
+ bool is_activationlayer_enabled{};
unsigned int kernel_width = 0;
unsigned int kernel_height = 0;
unsigned int mat_weights_cols = 0;
@@ -494,9 +474,14 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, append_bias, are_weights_reshaped, kernel_width, kernel_height,
- is_fully_connected_convolution, is_interleaved, is_quantized, mat_weights_cols, mat_weights_rows,
- conv_w, conv_h);
+ const DataLayout data_layout = input->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, act_info, dt, append_bias, skip_im2col, are_weights_reshaped, kernel_width, kernel_height,
+ is_fully_connected_convolution, is_interleaved, is_quantized, is_activationlayer_enabled, mat_weights_cols, mat_weights_rows,
+ conv_w, conv_h, dilation);
const Size2D kernel_weights = Size2D(kernel_width, kernel_height);
@@ -505,68 +490,11 @@
std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
bool optimised_kernel = false;
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ if(dt == DataType::F32)
{
optimised_kernel = true;
}
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
- {
- optimised_kernel = true;
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
- // Reshape weights if needed
- if(optimised_kernel)
- {
- if(are_weights_reshaped)
- {
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->dimension(1);
- }
- else
- {
- TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
-
- // Create tensor to store the reshaped weights
- reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
- weights = reshaped_weights.get();
- }
- }
- else
- {
- if(are_weights_reshaped)
- {
- const unsigned int transpose_width = 16 / input->element_size();
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->dimension(0) / transpose_width + (append_bias ? 1 : 0);
- }
- else
- {
- TensorShape reshaped_weights_shape;
-
- if(is_fully_connected_convolution || is_quantized)
- {
- reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->element_size();
- reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
- static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
- }
-
- // Create tensor to store the reshaped weights
- reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
- weights = reshaped_weights.get();
- }
- }
-
- // Validate im2col
const unsigned int mat_input_cols = mat_weights_rows;
const unsigned int mat_input_rows = conv_w * conv_h;
TensorShape shape_im2col = input->tensor_shape();
@@ -574,7 +502,17 @@
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false));
+
+ if(!skip_im2col)
+ {
+ // Validate im2col
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false, false, dilation));
+ }
+ else if(append_bias)
+ {
+ // Validate add bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+ }
// Create GEMM output tensor
TensorShape shape_gemm(im2_col_info.tensor_shape());
@@ -582,19 +520,63 @@
shape_gemm.set(1, mat_input_rows);
TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
- // Validate GEMM interleave and multiply
- if(is_interleaved)
+ // Reshape weights if needed
+ if(optimised_kernel)
{
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+ ARM_COMPUTE_RETURN_ERROR_ON(are_weights_reshaped);
+
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
}
- else
+ else if(!is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+ TensorShape reshaped_weights_shape;
+
+ if(is_fully_connected_convolution || is_quantized)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+ weights = reshaped_weights.get();
+
+ // Validate GEMM interleave and multiply
+ if(is_interleaved)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+ shape_interleaved.set(idx_height, std::ceil(shape_interleaved.y() / 4.f));
+ TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo(shape_im2col[1], // m
+ weights->tensor_shape()[0], // n
+ shape_im2col[0]) /* k */));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+ }
+ }
+ if(!is_nhwc)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(idx_width) != conv_w) || (output->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
+
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
}
return Status{};
@@ -605,19 +587,33 @@
// Run weights reshaping (runs once per configure call)
if(!_are_weights_reshaped)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_are_weights_reshaped = true;
_reshape_weights.run();
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
- // Run input reshaping
- NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+ if(!_skip_im2col)
+ {
+ // Run input reshaping
+ const unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ NEScheduler::get().schedule(&_input_im2col_kernel, y_dim);
+ }
// Runs matrix multiply on reshaped matrices
- if(_mm_optimised_kernel != nullptr)
+ if(_asm_glue._optimised_kernel != nullptr)
{
- NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _asm_glue.run();
+ // Release the reshaped weights if the assembly glue pretransposed them into its own buffer
+ if(!_weights_reshaped.is_used())
+ {
+ _weights_reshaped.allocator()->free();
+ }
}
else
{
@@ -638,6 +634,11 @@
}
}
+ if(_skip_im2col && _append_bias)
+ {
+ NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
+ }
+
// Run output stage for quantized case
if(_is_quantized)
{
@@ -645,7 +646,15 @@
}
// Reshape output matrix
- NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+ if(_data_layout == DataLayout::NCHW)
+ {
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+ }
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
_memory_group.release();
}
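
The main new idea in this file is _skip_im2col: with NHWC layout and a 1x1 kernel, the channels-innermost input already is the (H·W)×C matrix that im2col would have produced, so the GEMM consumes it directly and any bias is added afterwards by NEArithmeticAdditionKernel. The standalone illustration below shows why no reshaping is needed; plain loops stand in for the GEMM and all names are hypothetical.

#include <cstddef>
#include <vector>

// A 1x1 NHWC convolution as a single matrix multiply: 'in' is (H*W) x C as it
// sits in memory, 'w' is the C x M reshaped weights, and the output is (H*W) x M.
std::vector<float> conv1x1_nhwc(const std::vector<float> &in, int hw, int c,
                                const std::vector<float> &w, int m)
{
    std::vector<float> out(static_cast<std::size_t>(hw) * m, 0.f);
    for(int p = 0; p < hw; ++p)        // each spatial position is one GEMM row
    {
        for(int j = 0; j < m; ++j)     // each kernel is one GEMM column
        {
            for(int k = 0; k < c; ++k) // dot product over input channels
            {
                out[static_cast<std::size_t>(p) * m + j] += in[static_cast<std::size_t>(p) * c + k] * w[static_cast<std::size_t>(k) * m + j];
            }
        }
    }
    return out;
}

int main()
{
    const std::vector<float> in = { 1, 2, 3, 4, 5, 6 }; // 2 positions x 3 channels
    const std::vector<float> w  = { 1, 0, 0, 1, 1, 1 }; // 3 channels x 2 kernels
    const auto out = conv1x1_nhwc(in, 2, 3, w, 2);      // 2 positions x 2 outputs
    return out.size() == 4 ? 0 : 1;
}
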
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 9b36e81..98b4767 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017 ARM Limited.
+/* Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,20 +35,11 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
using namespace arm_compute;
NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+ : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
+ _workspace(), _B_pretransposed()
{
}
@@ -65,89 +52,29 @@
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ bool run_optimised = false;
#ifdef __aarch64__
- const int M = output->info()->tensor_shape().y();
- const int N = output->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
- constexpr size_t workspace_alignment = 4096;
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
+ switch(a->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
#endif /* __aarch64__ */
-
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else
-#elif defined(ARM_COMPUTE_AARCH64_V8A)
- if(ci.CPU == CPUTarget::A53)
- {
- switch(a->info()->data_type())
- {
- case DataType::S8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- case DataType::U8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
-
- _memory_group.manage(&_workspace);
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else if(1) // Generic v8a kernel
- {
- switch(a->info()->data_type())
- {
- case DataType::S8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- case DataType::U8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
- _memory_group.manage(&_workspace);
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+ if(!run_optimised)
{
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
TensorShape shape_tmp_a = a->info()->tensor_shape();
@@ -206,7 +133,18 @@
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.run();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
_memory_group.release();
}
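
The run() path above now prefers whichever assembly glue was successfully configured and only falls back to the scheduled NEON kernel when neither is available. Below is a minimal, self-contained sketch of that dispatch shape, using hypothetical stand-in types (AsmGlue, GemmLowpCore) rather than the library's real AssemblyKernelGlue API:

#include <functional>
#include <iostream>

// Hypothetical stand-in for the library's assembly-glue object.
struct AsmGlue
{
    std::function<void()> optimised_kernel; // empty when assembly setup failed
    void run() const { optimised_kernel(); }
};

struct GemmLowpCore
{
    AsmGlue asm_glue_signed;
    AsmGlue asm_glue_unsigned;

    // Mirrors the three-way dispatch in the patch: prefer an optimised
    // assembly path when one was configured, otherwise schedule the
    // generic C++ kernel.
    void run() const
    {
        if(asm_glue_unsigned.optimised_kernel)
        {
            asm_glue_unsigned.run();
        }
        else if(asm_glue_signed.optimised_kernel)
        {
            asm_glue_signed.run();
        }
        else
        {
            std::cout << "generic NEON kernel\n";
        }
    }
};

int main()
{
    GemmLowpCore core;
    core.asm_glue_unsigned.optimised_kernel = [] { std::cout << "u8 assembly kernel\n"; };
    core.run(); // prints "u8 assembly kernel"
}
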
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index ad47593..2e06fa2 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,11 +26,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,58 +37,48 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+ : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
+ _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
_a_offset = a->info()->quantization_info().offset;
_b_offset = b->info()->quantization_info().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- // Check for DOT product instruction
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
- if(cpu_has_dotprod != 0)
+#ifdef __aarch64__
+ switch(a->info()->data_type())
{
- _dot_product_path = true;
-
- // Configure matrix multiply kernel
- struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int M = output->info()->tensor_shape().y();
- const int N = output->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
- _mm_kernel = std::move(k);
+ case DataType::S8:
+ {
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
}
- else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+ if(!_dot_product_path)
{
if(_run_vector_matrix_multiplication)
{
@@ -110,7 +98,10 @@
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
// Configure interleave kernel
{
@@ -141,7 +132,10 @@
TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
- _memory_group.manage(&_vector_sum_col);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_vector_sum_col);
+ }
// Configure Matrix B reduction kernel
_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
@@ -168,10 +162,6 @@
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
}
- else
- {
- _workspace.allocator()->allocate();
- }
if(_a_offset != 0)
{
@@ -203,42 +193,28 @@
int32_t b_offset = b->quantization_info().offset;
bool run_vector_matrix_multiplication = a->dimension(1) < 2;
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- // Check for DOT product instruction
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
- if(cpu_has_dotprod != 0)
+ if(!run_vector_matrix_multiplication)
{
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
}
else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
{
- if(!run_vector_matrix_multiplication)
- {
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -282,13 +258,24 @@
NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
}
- if(_mtx_b_reshape_kernel)
+ if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
{
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
}
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.run();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
@@ -297,7 +284,7 @@
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
{
NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
}
@@ -306,4 +293,6 @@
NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
_memory_group.release();
+
+ _is_first_run = false;
}
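
The _reshape_b_only_on_first_run flag threaded through configure() and run() above implements a weights-caching idiom: when GEMMInfo promises that B is constant, the B reshape and the matrix-B reduction execute once, and their outputs are deliberately kept out of the memory group so they survive between runs. A minimal sketch of the idiom, with toy methods standing in for the kernels:

struct GemmLowpFunction
{
    bool _reshape_b_only_on_first_run = true; // promised via GEMMInfo
    bool _is_first_run                = true;

    void reshape_b() { /* expensive transform of the constant B matrix */ }
    void matrix_multiply() { /* GEMM using the cached reshaped B */ }

    void run()
    {
        if(_is_first_run || !_reshape_b_only_on_first_run)
        {
            reshape_b(); // skipped on every run after the first when B is constant
        }
        matrix_multiply();
        _is_first_run = false;
    }
};

int main()
{
    GemmLowpFunction f;
    f.run(); // reshapes B
    f.run(); // reuses the cached reshape
}
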
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index b962db9..6b95cb0 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -23,19 +23,30 @@
*/
#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+NEIm2Col::NEIm2Col()
+ : _kernel(), _y_dim(1)
{
- auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
- k->configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
- _kernel = std::move(k);
}
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
{
- return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
+ _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+{
+ return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+void NEIm2Col::run()
+{
+ NEScheduler::get().schedule(&_kernel, _y_dim);
}
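
NEIm2Col now owns its kernel directly and schedules it over the height dimension looked up from the input's data layout, so the same function covers NCHW and NHWC tensors. A sketch of that lookup, assuming ACL's usual storage order (NCHW tensors indexed [W, H, C, N] from dimension 0 upwards, NHWC tensors [C, W, H, N]); the enum names here are illustrative stand-ins:

#include <cstdio>

enum class DataLayout { NCHW, NHWC };
enum class Dim { WIDTH, HEIGHT, CHANNEL };

int dimension_index(DataLayout layout, Dim dim)
{
    if(layout == DataLayout::NCHW)
    {
        return (dim == Dim::WIDTH) ? 0 : (dim == Dim::HEIGHT) ? 1 : 2;
    }
    return (dim == Dim::CHANNEL) ? 0 : (dim == Dim::WIDTH) ? 1 : 2;
}

int main()
{
    std::printf("NCHW height index: %d\n", dimension_index(DataLayout::NCHW, Dim::HEIGHT)); // 1
    std::printf("NHWC height index: %d\n", dimension_index(DataLayout::NHWC, Dim::HEIGHT)); // 2
}

So _y_dim resolves to 1 for NCHW and 2 for NHWC, and run() parallelises the im2col kernel across rows regardless of layout.
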
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index fa62483..d0b80fb 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,26 @@
_sumsq.allocator()->allocate();
}
+Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ TensorShape shape(input->tensor_shape());
+
+ // Create intermediate tensor info
+ TensorInfo sum_sq;
+ sum_sq.set_data_type(input->data_type());
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+ // Reduce shape on axis (supported axis is 0)
+ shape.set(0, 1);
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+ return Status{};
+}
+
void NEL2NormalizeLayer::run()
{
_memory_group.acquire();
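
This validate() follows the pattern used throughout the patch (see also NEQuantizationLayer and NEReductionOperation below): build never-allocated intermediate TensorInfo objects, then chain the kernel validators so the first failure short-circuits. A toy Status/macro pair showing the shape of that chain; the real types live in arm_compute/core/Error.h:

#include <iostream>

struct Status
{
    bool        ok  = true;
    const char *msg = "";
};

#define RETURN_ON_ERROR(expr)        \
    do                               \
    {                                \
        const Status s = (expr);     \
        if(!s.ok) { return s; }      \
    } while(false)

Status validate_reduction() { return Status{}; }
Status validate_normalize() { return Status{ false, "axis not supported" }; }

Status validate_l2_normalize()
{
    // Each stage checks against intermediate tensor descriptions that are
    // never allocated; the first failure short-circuits out of the chain.
    RETURN_ON_ERROR(validate_reduction());
    RETURN_ON_ERROR(validate_normalize());
    return Status{};
}

int main() { std::cout << validate_l2_normalize().msg << '\n'; }
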
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 45ddb70..913acf8 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,39 +33,102 @@
using namespace arm_compute;
+namespace
+{
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
+{
+ ARM_COMPUTE_UNUSED(output);
+
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
+
+ bool has_bias = (biases != nullptr);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ const size_t mat_weights_cols = weights->dimension(3);
+ const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->dimension(4);
+
+ shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+
+ shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ shape_gemm = shape_im2col;
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false)
+ _is_first_run(false), _original_weights(nullptr)
{
}
+Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+ bool has_bias = (biases != nullptr);
+
+ if(has_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+ }
+
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+ TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+ TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+ TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+
+ return Status{};
+}
+
void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
- }
-
- bool _has_bias = (biases != nullptr);
- _is_first_run = true;
-
- // Get parameters for conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- unsigned int pad_x = 0;
- unsigned int pad_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- std::tie(pad_x, pad_y) = conv_info.pad();
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+ _original_weights = weights;
const unsigned int kernel_width = weights->info()->dimension(0);
const unsigned int kernel_height = weights->info()->dimension(1);
@@ -76,32 +139,14 @@
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
conv_info);
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
-
- // Create tensor to store the reshaped weights
- const size_t mat_weights_cols = weights->info()->dimension(3);
- const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
- const size_t mat_weights_num = weights->info()->dimension(4);
-
- const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
_weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
- // Create tensor to store im2col reshaped inputs
- const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
- shape_im2col.set(2, 1);
-
_input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
- // Create locally connected layer output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
// Manage intermediate buffers
@@ -125,8 +170,13 @@
// Run weights reshaping (Runs once for every configure)
if(_is_first_run)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_is_first_run = false;
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
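
The new _original_weights bookkeeping lets the function release the source weights once they have been reshaped: run() asserts the weights are still alive on the first pass, reshapes them, then marks them unused so the framework may reclaim their memory. A minimal sketch with stand-in types:

#include <cassert>

struct Weights
{
    bool used = true;
    bool is_used() const { return used; }
    void mark_as_unused() { used = false; }
};

struct LocallyConnected
{
    bool     _is_first_run     = true;
    Weights *_original_weights = nullptr;

    void run()
    {
        if(_is_first_run)
        {
            assert(_original_weights->is_used()); // weights must still be alive here
            _is_first_run = false;
            /* reshape weights once */
            _original_weights->mark_as_unused(); // memory may now be reclaimed
        }
        /* im2col + GEMM + col2im */
    }
};

int main()
{
    Weights w;
    LocallyConnected layer{ true, &w };
    layer.run();
    assert(!w.is_used());
}
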
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 5a474e4..cf6b984 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "support/ToolchainSupport.h"
@@ -30,11 +31,21 @@
using namespace arm_compute;
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
}
Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
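
The configure() change above handles X-axis broadcasting: when the output is wider than one element but one input collapses to a single column, that input's border is pre-filled with REPLICATE so the kernel can safely read past its end. A sketch of the selection logic with a toy Info type:

#include <cstdio>

struct Info { int dim0; };

// Returns the input whose X dimension collapses to 1 and therefore needs
// its border replicated before the multiplication kernel runs.
const Info *pick_broadcast(const Info &in1, const Info &in2, const Info &out)
{
    if(out.dim0 <= 1) { return nullptr; }                // no broadcasting on X
    const Info *candidate = (in1.dim0 == 1) ? &in1 : &in2;
    return (candidate->dim0 == 1) ? candidate : nullptr; // neither side broadcasts
}

int main()
{
    Info a{ 1 }, b{ 64 }, out{ 64 };
    std::printf("broadcast input: %s\n", pick_broadcast(a, b, out) == &a ? "in1" : "none/in2");
}
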
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index bc0b6f8..cbfd684 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -31,7 +31,7 @@
using namespace arm_compute;
NEPoolingLayer::NEPoolingLayer()
- : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false)
+ : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
{
}
@@ -40,17 +40,31 @@
// Check if we have Global Pooling Layer
_is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size().width) && (input->info()->dimension(1) == pool_info.pool_size().height);
+ // Get data layout
+ _data_layout = input->info()->data_layout();
+
// Configure pooling kernel
_pooling_layer_kernel.configure(input, output, pool_info);
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value(0.f);
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ switch(_data_layout)
{
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ case DataLayout::NCHW:
+ {
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+ BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ PixelValue zero_value(0.f);
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ {
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+ _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
+ break;
+ }
+ case DataLayout::NHWC:
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
}
- _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
}
Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -60,9 +74,20 @@
void NEPoolingLayer::run()
{
- // Fill border
- NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ switch(_data_layout)
+ {
+ case DataLayout::NCHW:
+ // Fill border
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
- // Run pooling layer
- NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+ // Run pooling layer
+ NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+ break;
+ case DataLayout::NHWC:
+ // Run pooling layer
+ NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
}
\ No newline at end of file
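
For NCHW the border value is now chosen per pool type: MAX pooling replicates edge pixels, while other pool types pad with a constant that, for asymmetric quantized data whose padding participates in the average, must be the quantization offset rather than numeric zero. A small sketch of that rule:

#include <cstdint>
#include <cstdio>

enum class PoolType { MAX, AVG, L2 };

// MAX pooling uses a REPLICATE border, so the constant value is unused;
// the other pool types pad with "zero", which for asymmetric quantized
// data whose padding is counted is the quantization offset.
uint32_t nchw_border_value(PoolType type, bool quantized_asymmetric, bool exclude_padding, int offset)
{
    if(type == PoolType::MAX) { return 0; } // border is replicated, value ignored
    return (quantized_asymmetric && !exclude_padding) ? static_cast<uint32_t>(offset) : 0U;
}

int main()
{
    std::printf("%u\n", nchw_border_value(PoolType::AVG, true, false, 128)); // 128
}
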
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index a131c48..8f7db96 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
@@ -34,8 +35,21 @@
{
}
+Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ TensorInfo min_max{ input->num_channels(), input->data_type() };
+ ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+
+ return Status{};
+}
+
void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
// Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
_min_max_kernel.configure(input, &_min_max);
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index f1a9145..cd0b42f 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,6 +63,13 @@
{
}
+Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+
+ return Status{};
+}
+
void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bd565c9..a9c85bd 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,6 @@
void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
{
ARM_COMPUTE_ERROR_ON(nullptr == offsets);
- ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
ARM_COMPUTE_UNUSED(sampling_policy);
Window win;
@@ -66,7 +65,7 @@
const int in_xi = std::floor(in_x);
const int in_yi = std::floor(in_y);
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size);
*reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
*reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
},
@@ -99,20 +98,20 @@
void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
+ // Get data layout and width/height indices
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Get the tensor shape
- const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+ const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
// Compute the ratio between source width/height and destination width/height
- const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
- const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+ const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
+ const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
// Get the element size of the input image
const size_t input_element_size = input->info()->element_size();
@@ -123,9 +122,6 @@
policy = InterpolationPolicy::NEAREST_NEIGHBOR;
}
- // Check if the border mode is UNDEFINED
- const bool border_undefined = border_mode == BorderMode::UNDEFINED;
-
switch(policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -133,7 +129,7 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -151,7 +147,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -164,7 +160,7 @@
}
case InterpolationPolicy::AREA:
{
- _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
break;
}
default:
@@ -174,6 +170,48 @@
_border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
}
+Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
+ BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
+
+ ITensorInfo *offsets = nullptr;
+ ITensorInfo *dx = nullptr;
+ ITensorInfo *dy = nullptr;
+
+ // Get data layout and width/height indices
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Get the tensor shape of the auxiliary buffers
+ const TensorShape shape(output->dimension(idx_width), output->dimension(idx_height));
+
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dx(shape, Format::F32);
+ TensorInfo tensor_info_dy(shape, Format::F32);
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ offsets = &tensor_info_offsets;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ offsets = &tensor_info_offsets;
+ dx = &tensor_info_dx;
+ dy = &tensor_info_dy;
+ break;
+ default:
+ break;
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
+ policy, border_mode, sampling_policy));
+ return Status{};
+}
+
void NEScale::run()
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
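
The offsets table that NEScale precomputes stores byte offsets into the source row, which is why the patch casts element_size to int before the multiply: a negative in_xi, possible near the left edge, must not wrap around in unsigned arithmetic. A sketch of the per-column computation, assuming CENTER sampling's half-pixel shift:

#include <cmath>
#include <cstdio>

int32_t source_byte_offset(int out_x, float width_ratio, int element_size)
{
    const float in_x  = (out_x + 0.5f) * width_ratio - 0.5f; // CENTER sampling shift
    const int   in_xi = static_cast<int>(std::floor(in_x));
    return in_xi * element_size; // signed multiply, as the patch's cast ensures
}

int main()
{
    // Downscale 8 -> 4 columns of F32 data: ratio 2.0, 4-byte elements.
    for(int x = 0; x < 4; ++x)
    {
        std::printf("out %d -> byte offset %d\n", x, source_byte_offset(x, 2.0f, 4));
    }
}
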
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..8f2c4c4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+
+namespace arm_compute
+{
+namespace
+{
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+ const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = input->info()->dimension(3);
+
+ return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ const DataLayout data_layout = input->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ return Status{};
+}
+
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+ Size2D output_tile = Size2D{};
+
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(2U, 2U);
+ }
+
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if the requested Winograd configuration requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} //namespace
+
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
+ _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
+ _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+{
+}
+
+void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
+
+ // Get indices for the width and height
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
+ const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ _weights = weights;
+ _input = input;
+ _output = output;
+
+ std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
+ std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
+ std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
+
+ int n_gemms = 0;
+ int N_BLOCK = 0; // Size of block used by GEMM.
+
+ switch(kernel_size.width)
+ {
+ case 3:
+ {
+ if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
+ {
+ transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
+ transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
+ transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+ }
+ else
+ {
+ transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
+ transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
+ transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+ }
+ break;
+ }
+ case 5:
+ {
+ transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
+ transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
+ transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+ }
+
+ const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+ const bool use_same_padding = use_padding_type == PADDING_SAME;
+
+ // Get convolved dimensions
+ const int in_channels = input->info()->dimension(channel_idx);
+ const int out_channels = output->info()->dimension(channel_idx);
+
+ const Tensor4DShape in_shape(internal_get_input_shape(input));
+ const size_t data_type_size = input->info()->element_size();
+ // Get the memory required to instantiate a new Winograd operator.
+ constexpr size_t storage_alignment = 64;
+ const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+ _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _kernel_storage.allocator()->allocate();
+ // Input storage
+ const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
+ _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _input_workspace.allocator()->allocate();
+
+ // Output storage
+ const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
+ _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _output_workspace.allocator()->allocate();
+
+ // Configure and allocate the dst tensor used to convert from the Winograd domain to the spatial domain when calling reshape_output()
+ TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+ _output->info()->dimension(1), _output->info()->dimension(3)),
+ 1, _output->info()->data_type());
+ _output_nhwc.allocator()->init(info);
+ _output_nhwc.allocator()->allocate();
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+ _weights_hwio.allocator()->allocate();
+
+ // Configure the kernel to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ _input_nhwc.allocator()->allocate();
+
+ const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
+
+ // Configure the InputTransform
+ const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+ transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+
+ // Configure WeightsTransform
+ const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+ transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+
+ // Configure OutputTransform
+ // The biases tensor has not been allocated at this point; the output transform will add the biases to the final result in the run() method
+ const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+ const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+ transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
+ output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
+ in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+
+ // Configure GEMM
+ const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height);
+ const int tile_cols = iceildiv(output_shape.n_cols, output_tile.width);
+ const int m = in_shape.n_batches * tile_rows * tile_cols;
+ const int k = in_shape.n_channels;
+ const int n = out_channels;
+ const int input_matrix_row_stride = in_shape.n_channels;
+ const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+ const int output_matrix_row_stride = kernel_matrix_row_stride;
+ unsigned int num_threads = NEScheduler::get().num_threads();
+
+ _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
+ _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
+ kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
+
+ auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
+ acl_gemm_wrapper->configure(_arm_gemm.get());
+ const size_t workspace_size = _arm_gemm->get_working_size();
+
+ // Allocate workspace
+ if(workspace_size > 0)
+ {
+ const unsigned int alignment = 4096;
+ allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
+ _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+ }
+
+ const unsigned int window_size = _arm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ _arm_gemm->set_nthreads(num_threads);
+ }
+
+ _gemm_kernel = std::move(acl_gemm_wrapper);
+
+ // Reorder the convolved output to ACL's NCHW ordering
+ _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
+ _transform_input_kernel = std::move(transform_input_kernel);
+ _transform_weights_kernel = std::move(transform_weights_kernel);
+ _transform_output_kernel = std::move(transform_output_kernel);
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
+}
+
+void NEWinogradConvolutionLayer::run()
+{
+ _memory_group.acquire();
+ if(!_reshaped_kernel)
+ {
+ _reshaped_kernel = true;
+ _permute_weights.run();
+ NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+ }
+ // Bring channels to the front as the Winograd code expects the tensor in NHWC format
+ _permute_input.run();
+
+ // Transform input tensor to the winograd domain
+ NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
+
+ // Run the batched GEMMs in multiple threads; each kernel runs one or more GEMMs
+ NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
+
+ // Transform output tensor to the spatial domain
+ NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
+
+ // Reorder the convolved output to ACL's NCHW ordering
+ _permute_output.run();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+ _memory_group.release();
+}
+
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height));
+ const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ input->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+ const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
+ switch(weights->dimension(idx_width))
+ {
+ case 3:
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
+ }
+ break;
+ }
+ case 5:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+ break;
+ }
+ }
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+
+ switch(weights->dimension(idx_width))
+ {
+ case 3:
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
+ }
+ break;
+ }
+ case 5:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+ break;
+ }
+ }
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ switch(weights->dimension(idx_width))
+ {
+ case 3:
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+ }
+ else
+ {
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+ }
+ break;
+ }
+ case 5:
+ {
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+ break;
+ }
+ }
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+} // namespace arm_compute
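
For reference, the tile selection above determines everything downstream: the input tile is output_tile + kernel - 1, so F(2x2, 3x3) runs 16 batched GEMMs while F(4x4, 3x3) and F(2x2, 5x5) run 36, and both 5x5 configurations sit behind enable_fast_math. A standalone restatement of winograd_output_tile():

#include <cstdio>
#include <utility>

std::pair<int, int> output_tile(int kernel_w, int kernel_h, int in_w, int in_h)
{
    if(kernel_w == 3 && kernel_h == 3)
    {
        // Small inputs fall back to the cheaper F(2x2, 3x3) transform.
        return (in_w <= 4 && in_h <= 4) ? std::make_pair(2, 2) : std::make_pair(4, 4);
    }
    if(kernel_w == 5 && kernel_h == 5)
    {
        return std::make_pair(2, 2); // both 5x5 variants require enable_fast_math
    }
    return std::make_pair(0, 0); // unsupported kernel size
}

int main()
{
    const auto tile = output_tile(3, 3, 224, 224);
    std::printf("tile %dx%d\n", tile.first, tile.second); // 4x4
}
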
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
deleted file mode 100644
index 0ac6d09..0000000
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-
-namespace
-{
-inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
-{
- const int in_width = input->info()->dimension(0);
- const int in_height = input->info()->dimension(1);
- const int in_batches = input->info()->dimension(3);
- const int in_channels = input->info()->dimension(2);
- return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
-}
-} /* namespace */
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
- ARM_COMPUTE_UNUSED(output);
-
- return Status{};
-}
-} //namespace
-
-NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _permute_input(),
- _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
- _reshaped_kernel(false)
-{
-} /* arm_compute */
-
-void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info));
-
- _weights = weights;
- _input = input;
- _output = output;
-
- std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
- std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
- std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
- std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
-
- switch(weights->info()->dimension(0))
- {
- case 3:
- {
- batched_gemm_kernel = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
- break;
- }
- case 5:
- {
- batched_gemm_kernel = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
- }
-
- const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
- const bool use_same_padding = use_padding_type == PADDING_SAME;
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
- // Get convolved dimensions
- const int in_channels = input->info()->dimension(2);
- const int out_channels = output->info()->dimension(2);
-
- const Tensor4DShape in_shape(internal_get_input_shape(input));
- const size_t data_type_size = input->info()->element_size();
- // Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
- _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _kernel_storage.allocator()->allocate();
- // Input storage
- const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
- _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _input_workspace.allocator()->allocate();
-
- // Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
- _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _output_workspace.allocator()->allocate();
-
- // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
- _output->info()->dimension(1), _output->info()->dimension(3)),
- 1, _output->info()->data_type());
- _output_nhwc.allocator()->init(info);
- _output_nhwc.allocator()->allocate();
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
- _weights_hwio.allocator()->allocate();
-
- // configure the kernel to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- _input_nhwc.allocator()->allocate();
-
- const int weights_width = weights->info()->dimension(0);
- const int weights_height = weights->info()->dimension(1);
- const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
-
- // Configure the InputTransform
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
-
- // Configure WeightsTransform
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
- transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
-
- // Configure OutputTransform
- //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
- transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
- output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-
- // Configure Batched GEMMs
- const int output_tile_rows = batched_gemm_kernel->get_output_tile_rows();
- const int output_tile_cols = batched_gemm_kernel->get_output_tile_cols();
- const int n_block = batched_gemm_kernel->get_number_blocks();
- const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
- const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int input_matrix_row_stride = in_shape.n_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, n_block);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
- const unsigned n_gemms = batched_gemm_kernel->get_number_gemms();
-
- batched_gemm_kernel->configure(n_gemms, m, k, n,
- input_matrix_stride, input_matrix_row_stride,
- kernel_matrix_stride, kernel_matrix_row_stride,
- output_matrix_stride, output_matrix_row_stride,
- reinterpret_cast<float *>(_input_workspace.buffer()),
- reinterpret_cast<float *>(_kernel_storage.buffer()),
- reinterpret_cast<float *>(_output_workspace.buffer()));
-
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
- _transform_input_kernel = std::move(transform_input_kernel);
- _transform_weights_kernel = std::move(transform_weights_kernel);
- _transform_output_kernel = std::move(transform_output_kernel);
- _batched_gemm_kernel = std::move(batched_gemm_kernel);
-}
-
-void NEWinogradLayer::run()
-{
- _memory_group.acquire();
- if(!_reshaped_kernel)
- {
- _reshaped_kernel = true;
- _permute_weights.run();
- NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- }
- //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _permute_input.run();
-
- // Transform input tensor to the winograd domain
- NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
- //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
-
- // Transform output tensor to the spatial domain
- NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.run();
- _memory_group.release();
-}
-
-Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
- ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, weights, biases, output, conv_info));
-
- return Status{};
-}
-
-} // namespace arm_compute
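The configure() method in the file deleted above sizes the batched GEMMs from the output tiling: m = batches * tile_rows * tile_cols, k = input channels, n = output channels, with the kernel matrix row stride rounded up to the block size. A self-contained sketch of that arithmetic follows; iceildiv and roundup are re-implemented here to mirror the arm_compute helpers of the same name, and the layer dimensions are chosen purely for illustration.

    #include <cstdio>

    constexpr int iceildiv(int a, int b)
    {
        return (a + b - 1) / b;
    }
    constexpr int roundup(int a, int b)
    {
        return iceildiv(a, b) * b;
    }

    int main()
    {
        // Hypothetical layer: 1 batch, 56x56 output, 64 -> 128 channels,
        // F(2x2, 3x3) so each GEMM tile yields a 2x2 output tile, n_block = 1.
        const int n_batches = 1, out_rows = 56, out_cols = 56;
        const int in_channels = 64, out_channels = 128, n_block = 1;
        const int output_tile_rows = 2, output_tile_cols = 2;

        const int tile_rows = iceildiv(out_rows, output_tile_rows);
        const int tile_cols = iceildiv(out_cols, output_tile_cols);
        const int m = n_batches * tile_rows * tile_cols; // one GEMM row per output tile
        const int k = in_channels;
        const int n = out_channels;
        const int kernel_matrix_row_stride = roundup(out_channels, n_block);

        std::printf("m=%d k=%d n=%d kernel_row_stride=%d\n", m, k, n, kernel_matrix_row_stride);
        return 0;
    }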
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index c6802f3..795c96c 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
#include <omp.h>
@@ -41,6 +42,7 @@
OMPScheduler::OMPScheduler() // NOLINT
: _num_threads(omp_get_max_threads())
{
+ get_cpu_configuration(_cpu_info);
}
unsigned int OMPScheduler::num_threads() const
@@ -59,7 +61,7 @@
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
ThreadInfo info;
- info.cpu_info = _info;
+ info.cpu_info = &_cpu_info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
@@ -74,7 +76,7 @@
#pragma omp parallel firstprivate(info) num_threads(info.num_threads)
{
const int tid = omp_get_thread_num();
- Window win = max_window.split_window(split_dimension, tid, info.num_threads);
+ Window win = max_window.split_window(split_dimension, tid, info.num_threads);
info.thread_id = tid;
kernel->run(win, info);
}
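The scheduling change above keeps the same parallel pattern: each OpenMP thread reads its id and takes one split of the kernel's window. A compilable sketch of that pattern (build with -fopenmp) is shown below, using a plain Range struct as a stand-in for arm_compute's Window and split_window().

    #include <algorithm>
    #include <cstdio>
    #include <omp.h>

    struct Range
    {
        int start, end;
    };

    Range split_range(Range full, int id, int total)
    {
        const int len = (full.end - full.start + total - 1) / total; // ceiling division
        const int s   = std::min(full.start + id * len, full.end);
        const int e   = std::min(s + len, full.end);
        return Range{ s, e };
    }

    int main()
    {
        const Range full{ 0, 100 };
        const int   num_threads = omp_get_max_threads();

        #pragma omp parallel num_threads(num_threads)
        {
            const int   tid = omp_get_thread_num();
            const Range win = split_range(full, tid, num_threads);
            std::printf("thread %d processes [%d, %d)\n", tid, win.start, win.end);
        }
        return 0;
    }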
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index 4540aea..d0b3bde 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,19 +58,24 @@
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
// Update blob size
- size_t max_group_size = std::accumulate(std::begin(_active_elements), std::end(_active_elements), static_cast<size_t>(0), [](size_t s, const Element & e)
+ size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
{
- return s + e.size;
+ return s + b.max_size;
});
_blob = std::max(_blob, max_group_size);
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
size_t offset = 0;
- for(auto &e : _active_elements)
+ for(auto &free_blob : _free_blobs)
{
- group_mappings[e.handle] = offset;
- offset += e.size;
+ for(auto &bound_element_id : free_blob.bound_elements)
+ {
+ ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
+ Element &bound_element = _active_elements[bound_element_id];
+ group_mappings[bound_element.handle] = offset;
+ }
+ offset += free_blob.max_size;
ARM_COMPUTE_ERROR_ON(offset > _blob);
}
}
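The rewritten update above assigns one offset per free blob rather than one per element: every element bound to a blob shares that blob's offset, so the total allocation is the sum of per-blob maxima instead of the sum of all element sizes. A simplified sketch of the mapping, with stand-in types and hypothetical element handles:

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    struct Blob
    {
        size_t                   max_size;
        std::vector<std::string> bound_elements; // handles whose lifetimes share this blob
    };

    int main()
    {
        // Two blobs: the first is reused by two tensors with disjoint lifetimes.
        const std::vector<Blob> free_blobs = {
            { 1024, { "conv1_out", "conv3_out" } },
            { 512, { "conv2_out" } },
        };

        std::map<std::string, size_t> mappings;
        size_t offset = 0;
        for(const auto &blob : free_blobs)
        {
            for(const auto &handle : blob.bound_elements)
            {
                mappings[handle] = offset; // all bound elements share the blob's offset
            }
            offset += blob.max_size;
        }

        for(const auto &m : mappings)
        {
            std::printf("%s -> offset %zu\n", m.first.c_str(), m.second);
        }
        std::printf("total blob size: %zu\n", offset); // 1536, not 1024 + 1024 + 512
        return 0;
    }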
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 42cc943..293241d 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,3 +72,10 @@
// Update semaphore
_sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
}
+
+size_t PoolManager::num_pools() const
+{
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+
+ return _free_pools.size() + _occupied_pools.size();
+}
\ No newline at end of file
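The new num_pools() aggregates the free and occupied lists under the manager's mutex, so a concurrent lock/unlock that moves a pool between the lists cannot be observed mid-count. A standalone sketch of the same idiom (simplified stand-in types; note the mutable mutex so the const accessor can lock it):

    #include <cstddef>
    #include <cstdio>
    #include <list>
    #include <mutex>

    struct Pool
    {
    };

    class PoolManagerSketch
    {
    public:
        size_t num_pools() const
        {
            std::lock_guard<std::mutex> lock(_mtx); // pools may migrate between lists concurrently
            return _free_pools.size() + _occupied_pools.size();
        }

    private:
        mutable std::mutex _mtx; // mutable so the const accessor can lock it
        std::list<Pool>    _free_pools{ Pool{}, Pool{} };
        std::list<Pool>    _occupied_pools{ Pool{} };
    };

    int main()
    {
        PoolManagerSketch mgr;
        std::printf("pools: %zu\n", mgr.num_pools());
        return 0;
    }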
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index c5b8f33..b010a32 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -27,6 +27,11 @@
using namespace arm_compute;
+SubTensor::SubTensor()
+ : _parent(nullptr), _info()
+{
+}
+
SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
: _parent(nullptr), _info()
{
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index a0d41b2..993a95b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -114,7 +115,7 @@
ARM_COMPUTE_UNUSED(validate_subtensor_shape);
// Copy pointer to buffer
- _memory = Memory(allocator._memory.buffer());
+ _memory = Memory(allocator._memory.region());
// Init tensor info with new dimensions
size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
@@ -126,22 +127,23 @@
uint8_t *TensorAllocator::data() const
{
- return _memory.buffer();
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void TensorAllocator::allocate()
{
- ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+
if(_associated_memory_group == nullptr)
{
- _memory = Memory(std::shared_ptr<uint8_t>(new uint8_t[info().total_size()](), [](uint8_t *ptr)
- {
- delete[] ptr;
- }));
+ _memory = Memory(std::make_shared<MemoryRegion>(info().total_size()));
}
else
{
- _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.handle()), info().total_size());
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.region()->handle()), info().total_size());
+ _memory.region()->set_size(info().total_size());
}
info().set_is_resizable(false);
}
@@ -154,7 +156,8 @@
arm_compute::Status TensorAllocator::import_memory(Memory memory)
{
- ARM_COMPUTE_RETURN_ERROR_ON(memory.buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->buffer() == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
_memory = memory;
info().set_is_resizable(false);
@@ -164,15 +167,17 @@
void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
{
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
_associated_memory_group = associated_memory_group;
}
uint8_t *TensorAllocator::lock()
{
- return _memory.buffer();
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void TensorAllocator::unlock()
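The allocator changes above route every access through a region object: buffer() for the raw pointer, handle() for the slot a memory group fills in during finalize_memory(), and set_size() once the final size is known. The sketch below shows the shape of such a region as inferred from the call sites above; it is not arm_compute's MemoryRegion implementation.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <memory>

    class MemoryRegionSketch
    {
    public:
        explicit MemoryRegionSketch(size_t size)
            : _storage(size != 0 ? new uint8_t[size]() : nullptr), _buffer(_storage.get()), _size(size)
        {
        }
        uint8_t *buffer() const
        {
            return _buffer;
        }
        // The slot a memory group can fill in lazily, mirroring
        // finalize_memory(..., _memory.region()->handle(), ...).
        uint8_t **handle()
        {
            return &_buffer;
        }
        void set_size(size_t size)
        {
            _size = size;
        }
        size_t size() const
        {
            return _size;
        }

    private:
        std::unique_ptr<uint8_t[]> _storage; // owning storage; empty until finalized for grouped tensors
        uint8_t                   *_buffer;  // what buffer()/handle() observe
        size_t                     _size;
    };

    int main()
    {
        const auto region = std::make_shared<MemoryRegionSketch>(256);
        std::printf("buffer=%p size=%zu\n", static_cast<void *>(region->buffer()), region->size());
        return 0;
    }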