blob: 4c3714ee0940dd5f4c0b4431340d5c31708b4b3b [file] [log] [blame]
Erich Elsen28928892020-06-12 08:08:19 -07001// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <assert.h>
7
8#include <psimd.h>
9
10#include <xnnpack/dwconv.h>
11#include <xnnpack/math.h>
12
13
14
15PSIMD_INTRINSIC psimd_f32 extq1_f32(psimd_f32 a, psimd_f32 b) {
16 #if defined(__clang__)
17 return __builtin_shufflevector(a, b, 1, 2, 3, 4);
18 #else
19 return __builtin_shuffle(a, b, (psimd_s32){1, 2, 3, 4});
20 #endif // defined(__clang__)
21}
22
23PSIMD_INTRINSIC psimd_f32 extq2_f32(psimd_f32 a, psimd_f32 b) {
24 #if defined(__clang__)
25 return __builtin_shufflevector(a, b, 2, 3, 4, 5);
26 #else
27 return __builtin_shuffle(a, b, (psimd_s32){2, 3, 4, 5});
28 #endif // defined(__clang__)
29}
30
31PSIMD_INTRINSIC psimd_f32 extq3_f32(psimd_f32 a, psimd_f32 b) {
32 #if defined(__clang__)
33 return __builtin_shufflevector(a, b, 3, 4, 5, 6);
34 #else
35 return __builtin_shuffle(a, b, (psimd_s32){3, 4, 5, 6});
36 #endif // defined(__clang__)
37}
38
Marat Dukhan1c6cad92020-10-21 15:48:21 -070039void xnn_f32_dwconv_chw_ukernel_5x5p2__psimd_3x4(
Erich Elsen28928892020-06-12 08:08:19 -070040 size_t input_height,
41 size_t input_width,
42 const float* input,
43 const float* weights,
44 const float *zero,
45 float* output,
46 uint32_t padding_top,
Erich Elsen28928892020-06-12 08:08:19 -070047 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
48{
Erich Elsen28928892020-06-12 08:08:19 -070049 assert(input_height != 0);
Marat Dukhan75157772020-10-21 01:46:28 -070050 assert(input_width != 0);
51 assert(input_width % sizeof(float) == 0);
Erich Elsen28928892020-06-12 08:08:19 -070052 assert(padding_top == 2);
53
Im Sunghoonea2088b2020-06-15 03:17:07 +090054 const psimd_s32 vmask = psimd_load_s32(params->scalar.mask);
Erich Elsen28928892020-06-12 08:08:19 -070055 const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
56 const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
57
Marat Dukhanfaf332d2020-10-21 02:55:14 -070058 const psimd_f32 vbias = psimd_load_splat_f32(weights);
59 const psimd_f32 vk00 = psimd_load_splat_f32(weights + 1);
60 const psimd_f32 vk01 = psimd_load_splat_f32(weights + 2);
61 const psimd_f32 vk02 = psimd_load_splat_f32(weights + 3);
62 const psimd_f32 vk03 = psimd_load_splat_f32(weights + 4);
63 const psimd_f32 vk04 = psimd_load_splat_f32(weights + 5);
64 const psimd_f32 vk10 = psimd_load_splat_f32(weights + 6);
65 const psimd_f32 vk11 = psimd_load_splat_f32(weights + 7);
66 const psimd_f32 vk12 = psimd_load_splat_f32(weights + 8);
67 const psimd_f32 vk13 = psimd_load_splat_f32(weights + 9);
68 const psimd_f32 vk14 = psimd_load_splat_f32(weights + 10);
69 const psimd_f32 vk20 = psimd_load_splat_f32(weights + 11);
70 const psimd_f32 vk21 = psimd_load_splat_f32(weights + 12);
71 const psimd_f32 vk22 = psimd_load_splat_f32(weights + 13);
72 const psimd_f32 vk23 = psimd_load_splat_f32(weights + 14);
73 const psimd_f32 vk24 = psimd_load_splat_f32(weights + 15);
74 const psimd_f32 vk30 = psimd_load_splat_f32(weights + 16);
75 const psimd_f32 vk31 = psimd_load_splat_f32(weights + 17);
76 const psimd_f32 vk32 = psimd_load_splat_f32(weights + 18);
77 const psimd_f32 vk33 = psimd_load_splat_f32(weights + 19);
78 const psimd_f32 vk34 = psimd_load_splat_f32(weights + 20);
79 const psimd_f32 vk40 = psimd_load_splat_f32(weights + 21);
80 const psimd_f32 vk41 = psimd_load_splat_f32(weights + 22);
81 const psimd_f32 vk42 = psimd_load_splat_f32(weights + 23);
82 const psimd_f32 vk43 = psimd_load_splat_f32(weights + 24);
83 const psimd_f32 vk44 = psimd_load_splat_f32(weights + 25);
Erich Elsen28928892020-06-12 08:08:19 -070084
Marat Dukhan75157772020-10-21 01:46:28 -070085 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070086
87 const float* i0 = zero;
88 const float* i1 = zero;
89 const float* i2 = input;
Marat Dukhan75157772020-10-21 01:46:28 -070090 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
91 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
92 const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
93 const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070094
95 float* o0 = output;
Marat Dukhan75157772020-10-21 01:46:28 -070096 float* o1 = (float*) ((uintptr_t) o0 + input_width);
97 float* o2 = (float*) ((uintptr_t) o1 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070098
99 size_t output_height = input_height;
100 do {
101 if XNN_UNPREDICTABLE(output_height < 2) {
102 i3 = zero;
103 o1 = o0;
104 }
105 if XNN_UNPREDICTABLE(output_height <= 2) {
106 i4 = zero;
107 o2 = o1;
108 }
109 if XNN_UNPREDICTABLE(output_height < 4) {
110 i5 = zero;
111 }
112 if XNN_UNPREDICTABLE(output_height <= 4) {
113 i6 = zero;
114 }
115
Erich Elsen28928892020-06-12 08:08:19 -0700116 psimd_f32 vi0x0123 = psimd_zero_f32();
117 psimd_f32 vi1x0123 = psimd_zero_f32();
118 psimd_f32 vi2x0123 = psimd_zero_f32();
119 psimd_f32 vi3x0123 = psimd_zero_f32();
120 psimd_f32 vi4x0123 = psimd_zero_f32();
121 psimd_f32 vi5x0123 = psimd_zero_f32();
122 psimd_f32 vi6x0123 = psimd_zero_f32();
Marat Dukhanbc967c72020-10-20 18:18:40 -0700123 psimd_f32 vi0x4567 = psimd_load_f32(i0); i0 += 4;
124 psimd_f32 vi1x4567 = psimd_load_f32(i1); i1 += 4;
125 psimd_f32 vi2x4567 = psimd_load_f32(i2); i2 += 4;
126 psimd_f32 vi3x4567 = psimd_load_f32(i3); i3 += 4;
127 psimd_f32 vi4x4567 = psimd_load_f32(i4); i4 += 4;
128 psimd_f32 vi5x4567 = psimd_load_f32(i5); i5 += 4;
129 psimd_f32 vi6x4567 = psimd_load_f32(i6); i6 += 4;
Erich Elsen28928892020-06-12 08:08:19 -0700130
Marat Dukhana690c092020-10-20 19:01:06 -0700131 size_t w = input_width;
Marat Dukhan75157772020-10-21 01:46:28 -0700132 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
Marat Dukhan6f469a52020-10-21 20:02:52 -0700133 psimd_f32 vo4567p0 = vbias;
134 psimd_f32 vo4567p1 = vbias;
135 psimd_f32 vo4567p2 = vbias;
Erich Elsen28928892020-06-12 08:08:19 -0700136
Marat Dukhanbc967c72020-10-20 18:18:40 -0700137 const psimd_f32 vi0x89AB = psimd_load_f32(i0); i0 += 4;
138 const psimd_f32 vi1x89AB = psimd_load_f32(i1); i1 += 4;
139 const psimd_f32 vi2x89AB = psimd_load_f32(i2); i2 += 4;
140 const psimd_f32 vi3x89AB = psimd_load_f32(i3); i3 += 4;
141 const psimd_f32 vi4x89AB = psimd_load_f32(i4); i4 += 4;
142 const psimd_f32 vi5x89AB = psimd_load_f32(i5); i5 += 4;
143 const psimd_f32 vi6x89AB = psimd_load_f32(i6); i6 += 4;
Erich Elsen28928892020-06-12 08:08:19 -0700144
Marat Dukhan6f469a52020-10-21 20:02:52 -0700145 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x4567, vk02);
146 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x4567, vk02);
147 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x4567, vk02);
Erich Elsen28928892020-06-12 08:08:19 -0700148
Marat Dukhan6f469a52020-10-21 20:02:52 -0700149 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x4567, vk12);
150 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x4567, vk12);
151 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x4567, vk12);
Erich Elsen28928892020-06-12 08:08:19 -0700152
Marat Dukhan6f469a52020-10-21 20:02:52 -0700153 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x4567, vk22);
154 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x4567, vk22);
155 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x4567, vk22);
Erich Elsen28928892020-06-12 08:08:19 -0700156
Marat Dukhan6f469a52020-10-21 20:02:52 -0700157 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x4567, vk32);
158 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x4567, vk32);
159 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x4567, vk32);
Erich Elsen28928892020-06-12 08:08:19 -0700160
Marat Dukhan6f469a52020-10-21 20:02:52 -0700161 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x4567, vk42);
162 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x4567, vk42);
163 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x4567, vk42);
Erich Elsen28928892020-06-12 08:08:19 -0700164
165 const psimd_f32 vi0x3456 = extq3_f32(vi0x0123, vi0x4567);
166 const psimd_f32 vi1x3456 = extq3_f32(vi1x0123, vi1x4567);
167 const psimd_f32 vi2x3456 = extq3_f32(vi2x0123, vi2x4567);
168 const psimd_f32 vi3x3456 = extq3_f32(vi3x0123, vi3x4567);
169 const psimd_f32 vi4x3456 = extq3_f32(vi4x0123, vi4x4567);
170 const psimd_f32 vi5x3456 = extq3_f32(vi5x0123, vi5x4567);
171 const psimd_f32 vi6x3456 = extq3_f32(vi6x0123, vi6x4567);
172
Marat Dukhan6f469a52020-10-21 20:02:52 -0700173 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x3456, vk01);
174 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x3456, vk01);
175 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x3456, vk01);
Erich Elsen28928892020-06-12 08:08:19 -0700176
Marat Dukhan6f469a52020-10-21 20:02:52 -0700177 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x3456, vk11);
178 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x3456, vk11);
179 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x3456, vk11);
Erich Elsen28928892020-06-12 08:08:19 -0700180
Marat Dukhan6f469a52020-10-21 20:02:52 -0700181 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x3456, vk21);
182 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x3456, vk21);
183 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x3456, vk21);
Erich Elsen28928892020-06-12 08:08:19 -0700184
Marat Dukhan6f469a52020-10-21 20:02:52 -0700185 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x3456, vk31);
186 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x3456, vk31);
187 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x3456, vk31);
Erich Elsen28928892020-06-12 08:08:19 -0700188
Marat Dukhan6f469a52020-10-21 20:02:52 -0700189 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x3456, vk41);
190 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x3456, vk41);
191 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x3456, vk41);
Erich Elsen28928892020-06-12 08:08:19 -0700192
193 const psimd_f32 vi0x2345 = extq2_f32(vi0x0123, vi0x4567);
194 const psimd_f32 vi1x2345 = extq2_f32(vi1x0123, vi1x4567);
195 const psimd_f32 vi2x2345 = extq2_f32(vi2x0123, vi2x4567);
196 const psimd_f32 vi3x2345 = extq2_f32(vi3x0123, vi3x4567);
197 const psimd_f32 vi4x2345 = extq2_f32(vi4x0123, vi4x4567);
198 const psimd_f32 vi5x2345 = extq2_f32(vi5x0123, vi5x4567);
199 const psimd_f32 vi6x2345 = extq2_f32(vi6x0123, vi6x4567);
200
Marat Dukhan6f469a52020-10-21 20:02:52 -0700201 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x2345, vk00);
202 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x2345, vk00);
203 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x2345, vk00);
Erich Elsen28928892020-06-12 08:08:19 -0700204
Marat Dukhan6f469a52020-10-21 20:02:52 -0700205 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x2345, vk10);
206 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x2345, vk10);
207 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x2345, vk10);
Erich Elsen28928892020-06-12 08:08:19 -0700208
Marat Dukhan6f469a52020-10-21 20:02:52 -0700209 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x2345, vk20);
210 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x2345, vk20);
211 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x2345, vk20);
Erich Elsen28928892020-06-12 08:08:19 -0700212
Marat Dukhan6f469a52020-10-21 20:02:52 -0700213 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x2345, vk30);
214 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x2345, vk30);
215 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x2345, vk30);
Erich Elsen28928892020-06-12 08:08:19 -0700216
Marat Dukhan6f469a52020-10-21 20:02:52 -0700217 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x2345, vk40);
218 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x2345, vk40);
219 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x2345, vk40);
Erich Elsen28928892020-06-12 08:08:19 -0700220
221 vi0x0123 = vi0x4567;
222 vi1x0123 = vi1x4567;
223 vi2x0123 = vi2x4567;
224 vi3x0123 = vi3x4567;
225 vi4x0123 = vi4x4567;
226 vi5x0123 = vi5x4567;
227 vi6x0123 = vi6x4567;
228
229 const psimd_f32 vi0x5678 = extq1_f32(vi0x4567, vi0x89AB);
230 const psimd_f32 vi1x5678 = extq1_f32(vi1x4567, vi1x89AB);
231 const psimd_f32 vi2x5678 = extq1_f32(vi2x4567, vi2x89AB);
232 const psimd_f32 vi3x5678 = extq1_f32(vi3x4567, vi3x89AB);
233 const psimd_f32 vi4x5678 = extq1_f32(vi4x4567, vi4x89AB);
234 const psimd_f32 vi5x5678 = extq1_f32(vi5x4567, vi5x89AB);
235 const psimd_f32 vi6x5678 = extq1_f32(vi6x4567, vi6x89AB);
236
Marat Dukhan6f469a52020-10-21 20:02:52 -0700237 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x5678, vk03);
238 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x5678, vk03);
239 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x5678, vk03);
Erich Elsen28928892020-06-12 08:08:19 -0700240
Marat Dukhan6f469a52020-10-21 20:02:52 -0700241 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x5678, vk13);
242 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x5678, vk13);
243 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x5678, vk13);
Erich Elsen28928892020-06-12 08:08:19 -0700244
Marat Dukhan6f469a52020-10-21 20:02:52 -0700245 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x5678, vk23);
246 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x5678, vk23);
247 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x5678, vk23);
Erich Elsen28928892020-06-12 08:08:19 -0700248
Marat Dukhan6f469a52020-10-21 20:02:52 -0700249 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x5678, vk33);
250 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x5678, vk33);
251 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x5678, vk33);
Erich Elsen28928892020-06-12 08:08:19 -0700252
Marat Dukhan6f469a52020-10-21 20:02:52 -0700253 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x5678, vk43);
254 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x5678, vk43);
255 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x5678, vk43);
Erich Elsen28928892020-06-12 08:08:19 -0700256
257 const psimd_f32 vi0x6789 = extq2_f32(vi0x4567, vi0x89AB);
258 const psimd_f32 vi1x6789 = extq2_f32(vi1x4567, vi1x89AB);
259 const psimd_f32 vi2x6789 = extq2_f32(vi2x4567, vi2x89AB);
260 const psimd_f32 vi3x6789 = extq2_f32(vi3x4567, vi3x89AB);
261 const psimd_f32 vi4x6789 = extq2_f32(vi4x4567, vi4x89AB);
262 const psimd_f32 vi5x6789 = extq2_f32(vi5x4567, vi5x89AB);
263 const psimd_f32 vi6x6789 = extq2_f32(vi6x4567, vi6x89AB);
264
Marat Dukhan6f469a52020-10-21 20:02:52 -0700265 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x6789, vk04);
266 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x6789, vk04);
267 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x6789, vk04);
Erich Elsen28928892020-06-12 08:08:19 -0700268
Marat Dukhan6f469a52020-10-21 20:02:52 -0700269 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x6789, vk14);
270 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x6789, vk14);
271 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x6789, vk14);
Erich Elsen28928892020-06-12 08:08:19 -0700272
Marat Dukhan6f469a52020-10-21 20:02:52 -0700273 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x6789, vk24);
274 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x6789, vk24);
275 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x6789, vk24);
Erich Elsen28928892020-06-12 08:08:19 -0700276
Marat Dukhan6f469a52020-10-21 20:02:52 -0700277 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x6789, vk34);
278 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x6789, vk34);
279 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x6789, vk34);
Erich Elsen28928892020-06-12 08:08:19 -0700280
Marat Dukhan6f469a52020-10-21 20:02:52 -0700281 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x6789, vk44);
282 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x6789, vk44);
283 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x6789, vk44);
Erich Elsen28928892020-06-12 08:08:19 -0700284
285 vi0x4567 = vi0x89AB;
286 vi1x4567 = vi1x89AB;
287 vi2x4567 = vi2x89AB;
288 vi3x4567 = vi3x89AB;
289 vi4x4567 = vi4x89AB;
290 vi5x4567 = vi5x89AB;
291 vi6x4567 = vi6x89AB;
292
Marat Dukhan6f469a52020-10-21 20:02:52 -0700293 psimd_f32 vo0 = vo4567p0;
294 psimd_f32 vo1 = vo4567p1;
295 psimd_f32 vo2 = vo4567p2;
Erich Elsen28928892020-06-12 08:08:19 -0700296
297 vo0 = psimd_max_f32(vo0, vmin);
Erich Elsen28928892020-06-12 08:08:19 -0700298 vo1 = psimd_max_f32(vo1, vmin);
Erich Elsen28928892020-06-12 08:08:19 -0700299 vo2 = psimd_max_f32(vo2, vmin);
Marat Dukhanfaf332d2020-10-21 02:55:14 -0700300
301 vo0 = psimd_min_f32(vo0, vmax);
302 vo1 = psimd_min_f32(vo1, vmax);
Erich Elsen28928892020-06-12 08:08:19 -0700303 vo2 = psimd_min_f32(vo2, vmax);
304
Marat Dukhana690c092020-10-20 19:01:06 -0700305 psimd_store_f32(o2, vo2); o2 += 4;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700306 psimd_store_f32(o1, vo1); o1 += 4;
307 psimd_store_f32(o0, vo0); o0 += 4;
Erich Elsen28928892020-06-12 08:08:19 -0700308 }
309 // Always process the last block of 5..8 pixels.
Marat Dukhanfaf332d2020-10-21 02:55:14 -0700310 if XNN_LIKELY(w > 4 * sizeof(float)) {
Marat Dukhan6f469a52020-10-21 20:02:52 -0700311 psimd_f32 vo4567p0 = vbias;
312 psimd_f32 vo4567p1 = vbias;
313 psimd_f32 vo4567p2 = vbias;
Erich Elsen28928892020-06-12 08:08:19 -0700314
Marat Dukhanbc967c72020-10-20 18:18:40 -0700315 psimd_f32 vi0x89AB = psimd_load_f32(i0); i0 += 4;
316 psimd_f32 vi1x89AB = psimd_load_f32(i1); i1 += 4;
317 psimd_f32 vi2x89AB = psimd_load_f32(i2); i2 += 4;
318 psimd_f32 vi3x89AB = psimd_load_f32(i3); i3 += 4;
319 psimd_f32 vi4x89AB = psimd_load_f32(i4); i4 += 4;
320 psimd_f32 vi5x89AB = psimd_load_f32(i5); i5 += 4;
321 psimd_f32 vi6x89AB = psimd_load_f32(i6); i6 += 4;
Erich Elsen28928892020-06-12 08:08:19 -0700322
323 vi0x89AB = psimd_andmask_f32(vmask, vi0x89AB);
324 vi1x89AB = psimd_andmask_f32(vmask, vi1x89AB);
325 vi2x89AB = psimd_andmask_f32(vmask, vi2x89AB);
326 vi3x89AB = psimd_andmask_f32(vmask, vi3x89AB);
327 vi4x89AB = psimd_andmask_f32(vmask, vi4x89AB);
328 vi5x89AB = psimd_andmask_f32(vmask, vi5x89AB);
329 vi6x89AB = psimd_andmask_f32(vmask, vi6x89AB);
330
Marat Dukhan6f469a52020-10-21 20:02:52 -0700331 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x4567, vk02);
332 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x4567, vk02);
333 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x4567, vk02);
Erich Elsen28928892020-06-12 08:08:19 -0700334
Marat Dukhan6f469a52020-10-21 20:02:52 -0700335 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x4567, vk12);
336 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x4567, vk12);
337 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x4567, vk12);
Erich Elsen28928892020-06-12 08:08:19 -0700338
Marat Dukhan6f469a52020-10-21 20:02:52 -0700339 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x4567, vk22);
340 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x4567, vk22);
341 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x4567, vk22);
Erich Elsen28928892020-06-12 08:08:19 -0700342
Marat Dukhan6f469a52020-10-21 20:02:52 -0700343 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x4567, vk32);
344 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x4567, vk32);
345 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x4567, vk32);
Erich Elsen28928892020-06-12 08:08:19 -0700346
Marat Dukhan6f469a52020-10-21 20:02:52 -0700347 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x4567, vk42);
348 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x4567, vk42);
349 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x4567, vk42);
Erich Elsen28928892020-06-12 08:08:19 -0700350
351 const psimd_f32 vi0x3456 = extq3_f32(vi0x0123, vi0x4567);
352 const psimd_f32 vi1x3456 = extq3_f32(vi1x0123, vi1x4567);
353 const psimd_f32 vi2x3456 = extq3_f32(vi2x0123, vi2x4567);
354 const psimd_f32 vi3x3456 = extq3_f32(vi3x0123, vi3x4567);
355 const psimd_f32 vi4x3456 = extq3_f32(vi4x0123, vi4x4567);
356 const psimd_f32 vi5x3456 = extq3_f32(vi5x0123, vi5x4567);
357 const psimd_f32 vi6x3456 = extq3_f32(vi6x0123, vi6x4567);
358
Marat Dukhan6f469a52020-10-21 20:02:52 -0700359 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x3456, vk01);
360 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x3456, vk01);
361 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x3456, vk01);
Erich Elsen28928892020-06-12 08:08:19 -0700362
Marat Dukhan6f469a52020-10-21 20:02:52 -0700363 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x3456, vk11);
364 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x3456, vk11);
365 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x3456, vk11);
Erich Elsen28928892020-06-12 08:08:19 -0700366
Marat Dukhan6f469a52020-10-21 20:02:52 -0700367 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x3456, vk21);
368 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x3456, vk21);
369 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x3456, vk21);
Erich Elsen28928892020-06-12 08:08:19 -0700370
Marat Dukhan6f469a52020-10-21 20:02:52 -0700371 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x3456, vk31);
372 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x3456, vk31);
373 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x3456, vk31);
Erich Elsen28928892020-06-12 08:08:19 -0700374
Marat Dukhan6f469a52020-10-21 20:02:52 -0700375 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x3456, vk41);
376 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x3456, vk41);
377 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x3456, vk41);
Erich Elsen28928892020-06-12 08:08:19 -0700378
379 const psimd_f32 vi0x2345 = extq2_f32(vi0x0123, vi0x4567);
380 const psimd_f32 vi1x2345 = extq2_f32(vi1x0123, vi1x4567);
381 const psimd_f32 vi2x2345 = extq2_f32(vi2x0123, vi2x4567);
382 const psimd_f32 vi3x2345 = extq2_f32(vi3x0123, vi3x4567);
383 const psimd_f32 vi4x2345 = extq2_f32(vi4x0123, vi4x4567);
384 const psimd_f32 vi5x2345 = extq2_f32(vi5x0123, vi5x4567);
385 const psimd_f32 vi6x2345 = extq2_f32(vi6x0123, vi6x4567);
386
Marat Dukhan6f469a52020-10-21 20:02:52 -0700387 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x2345, vk00);
388 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x2345, vk00);
389 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x2345, vk00);
Erich Elsen28928892020-06-12 08:08:19 -0700390
Marat Dukhan6f469a52020-10-21 20:02:52 -0700391 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x2345, vk10);
392 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x2345, vk10);
393 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x2345, vk10);
Erich Elsen28928892020-06-12 08:08:19 -0700394
Marat Dukhan6f469a52020-10-21 20:02:52 -0700395 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x2345, vk20);
396 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x2345, vk20);
397 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x2345, vk20);
Erich Elsen28928892020-06-12 08:08:19 -0700398
Marat Dukhan6f469a52020-10-21 20:02:52 -0700399 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x2345, vk30);
400 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x2345, vk30);
401 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x2345, vk30);
Erich Elsen28928892020-06-12 08:08:19 -0700402
Marat Dukhan6f469a52020-10-21 20:02:52 -0700403 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x2345, vk40);
404 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x2345, vk40);
405 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x2345, vk40);
Erich Elsen28928892020-06-12 08:08:19 -0700406
407 vi0x0123 = vi0x4567;
408 vi1x0123 = vi1x4567;
409 vi2x0123 = vi2x4567;
410 vi3x0123 = vi3x4567;
411 vi4x0123 = vi4x4567;
412 vi5x0123 = vi5x4567;
413 vi6x0123 = vi6x4567;
414
415 const psimd_f32 vi0x5678 = extq1_f32(vi0x4567, vi0x89AB);
416 const psimd_f32 vi1x5678 = extq1_f32(vi1x4567, vi1x89AB);
417 const psimd_f32 vi2x5678 = extq1_f32(vi2x4567, vi2x89AB);
418 const psimd_f32 vi3x5678 = extq1_f32(vi3x4567, vi3x89AB);
419 const psimd_f32 vi4x5678 = extq1_f32(vi4x4567, vi4x89AB);
420 const psimd_f32 vi5x5678 = extq1_f32(vi5x4567, vi5x89AB);
421 const psimd_f32 vi6x5678 = extq1_f32(vi6x4567, vi6x89AB);
422
Marat Dukhan6f469a52020-10-21 20:02:52 -0700423 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x5678, vk03);
424 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x5678, vk03);
425 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x5678, vk03);
Erich Elsen28928892020-06-12 08:08:19 -0700426
Marat Dukhan6f469a52020-10-21 20:02:52 -0700427 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x5678, vk13);
428 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x5678, vk13);
429 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x5678, vk13);
Erich Elsen28928892020-06-12 08:08:19 -0700430
Marat Dukhan6f469a52020-10-21 20:02:52 -0700431 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x5678, vk23);
432 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x5678, vk23);
433 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x5678, vk23);
Erich Elsen28928892020-06-12 08:08:19 -0700434
Marat Dukhan6f469a52020-10-21 20:02:52 -0700435 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x5678, vk33);
436 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x5678, vk33);
437 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x5678, vk33);
Erich Elsen28928892020-06-12 08:08:19 -0700438
Marat Dukhan6f469a52020-10-21 20:02:52 -0700439 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x5678, vk43);
440 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x5678, vk43);
441 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x5678, vk43);
Erich Elsen28928892020-06-12 08:08:19 -0700442
443 const psimd_f32 vi0x6789 = extq2_f32(vi0x4567, vi0x89AB);
444 const psimd_f32 vi1x6789 = extq2_f32(vi1x4567, vi1x89AB);
445 const psimd_f32 vi2x6789 = extq2_f32(vi2x4567, vi2x89AB);
446 const psimd_f32 vi3x6789 = extq2_f32(vi3x4567, vi3x89AB);
447 const psimd_f32 vi4x6789 = extq2_f32(vi4x4567, vi4x89AB);
448 const psimd_f32 vi5x6789 = extq2_f32(vi5x4567, vi5x89AB);
449 const psimd_f32 vi6x6789 = extq2_f32(vi6x4567, vi6x89AB);
450
Marat Dukhan6f469a52020-10-21 20:02:52 -0700451 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x6789, vk04);
452 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x6789, vk04);
453 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x6789, vk04);
Erich Elsen28928892020-06-12 08:08:19 -0700454
Marat Dukhan6f469a52020-10-21 20:02:52 -0700455 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x6789, vk14);
456 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x6789, vk14);
457 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x6789, vk14);
Erich Elsen28928892020-06-12 08:08:19 -0700458
Marat Dukhan6f469a52020-10-21 20:02:52 -0700459 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x6789, vk24);
460 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x6789, vk24);
461 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x6789, vk24);
Erich Elsen28928892020-06-12 08:08:19 -0700462
Marat Dukhan6f469a52020-10-21 20:02:52 -0700463 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x6789, vk34);
464 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x6789, vk34);
465 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x6789, vk34);
Erich Elsen28928892020-06-12 08:08:19 -0700466
Marat Dukhan6f469a52020-10-21 20:02:52 -0700467 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x6789, vk44);
468 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x6789, vk44);
469 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x6789, vk44);
Erich Elsen28928892020-06-12 08:08:19 -0700470
471 vi0x4567 = vi0x89AB;
472 vi1x4567 = vi1x89AB;
473 vi2x4567 = vi2x89AB;
474 vi3x4567 = vi3x89AB;
475 vi4x4567 = vi4x89AB;
476 vi5x4567 = vi5x89AB;
477 vi6x4567 = vi6x89AB;
478
Marat Dukhan6f469a52020-10-21 20:02:52 -0700479 psimd_f32 vo0 = vo4567p0;
480 psimd_f32 vo1 = vo4567p1;
481 psimd_f32 vo2 = vo4567p2;
Erich Elsen28928892020-06-12 08:08:19 -0700482
483 vo0 = psimd_max_f32(vo0, vmin);
Erich Elsen28928892020-06-12 08:08:19 -0700484 vo1 = psimd_max_f32(vo1, vmin);
Erich Elsen28928892020-06-12 08:08:19 -0700485 vo2 = psimd_max_f32(vo2, vmin);
Marat Dukhanfaf332d2020-10-21 02:55:14 -0700486
487 vo0 = psimd_min_f32(vo0, vmax);
488 vo1 = psimd_min_f32(vo1, vmax);
Erich Elsen28928892020-06-12 08:08:19 -0700489 vo2 = psimd_min_f32(vo2, vmax);
490
Marat Dukhana690c092020-10-20 19:01:06 -0700491 psimd_store_f32(o2, vo2); o2 += 4;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700492 psimd_store_f32(o1, vo1); o1 += 4;
493 psimd_store_f32(o0, vo0); o0 += 4;
Marat Dukhan75157772020-10-21 01:46:28 -0700494 w -= 4 * sizeof(float);
Erich Elsen28928892020-06-12 08:08:19 -0700495 }
Marat Dukhan75157772020-10-21 01:46:28 -0700496 assert(w >= 1 * sizeof(float));
497 assert(w <= 4 * sizeof(float));
Erich Elsen28928892020-06-12 08:08:19 -0700498 {
Marat Dukhan6f469a52020-10-21 20:02:52 -0700499 psimd_f32 vo4567p0 = vbias;
500 psimd_f32 vo4567p1 = vbias;
501 psimd_f32 vo4567p2 = vbias;
Erich Elsen28928892020-06-12 08:08:19 -0700502
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700503 // This might have already happened if there are more than 4 pixels, but we can't count on it.
Erich Elsen28928892020-06-12 08:08:19 -0700504 vi0x4567 = psimd_andmask_f32(vmask, vi0x4567);
505 vi1x4567 = psimd_andmask_f32(vmask, vi1x4567);
506 vi2x4567 = psimd_andmask_f32(vmask, vi2x4567);
507 vi3x4567 = psimd_andmask_f32(vmask, vi3x4567);
508 vi4x4567 = psimd_andmask_f32(vmask, vi4x4567);
509 vi5x4567 = psimd_andmask_f32(vmask, vi5x4567);
510 vi6x4567 = psimd_andmask_f32(vmask, vi6x4567);
511
Marat Dukhan6f469a52020-10-21 20:02:52 -0700512 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x4567, vk02);
513 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x4567, vk02);
514 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x4567, vk02);
Erich Elsen28928892020-06-12 08:08:19 -0700515
Marat Dukhan6f469a52020-10-21 20:02:52 -0700516 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x4567, vk12);
517 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x4567, vk12);
518 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x4567, vk12);
Erich Elsen28928892020-06-12 08:08:19 -0700519
Marat Dukhan6f469a52020-10-21 20:02:52 -0700520 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x4567, vk22);
521 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x4567, vk22);
522 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x4567, vk22);
Erich Elsen28928892020-06-12 08:08:19 -0700523
Marat Dukhan6f469a52020-10-21 20:02:52 -0700524 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x4567, vk32);
525 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x4567, vk32);
526 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x4567, vk32);
Erich Elsen28928892020-06-12 08:08:19 -0700527
Marat Dukhan6f469a52020-10-21 20:02:52 -0700528 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x4567, vk42);
529 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x4567, vk42);
530 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x4567, vk42);
Erich Elsen28928892020-06-12 08:08:19 -0700531
532 const psimd_f32 vi0x3456 = extq3_f32(vi0x0123, vi0x4567);
533 const psimd_f32 vi1x3456 = extq3_f32(vi1x0123, vi1x4567);
534 const psimd_f32 vi2x3456 = extq3_f32(vi2x0123, vi2x4567);
535 const psimd_f32 vi3x3456 = extq3_f32(vi3x0123, vi3x4567);
536 const psimd_f32 vi4x3456 = extq3_f32(vi4x0123, vi4x4567);
537 const psimd_f32 vi5x3456 = extq3_f32(vi5x0123, vi5x4567);
538 const psimd_f32 vi6x3456 = extq3_f32(vi6x0123, vi6x4567);
539
Marat Dukhan6f469a52020-10-21 20:02:52 -0700540 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x3456, vk01);
541 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x3456, vk01);
542 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x3456, vk01);
Erich Elsen28928892020-06-12 08:08:19 -0700543
Marat Dukhan6f469a52020-10-21 20:02:52 -0700544 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x3456, vk11);
545 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x3456, vk11);
546 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x3456, vk11);
Erich Elsen28928892020-06-12 08:08:19 -0700547
Marat Dukhan6f469a52020-10-21 20:02:52 -0700548 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x3456, vk21);
549 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x3456, vk21);
550 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x3456, vk21);
Erich Elsen28928892020-06-12 08:08:19 -0700551
Marat Dukhan6f469a52020-10-21 20:02:52 -0700552 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x3456, vk31);
553 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x3456, vk31);
554 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x3456, vk31);
Erich Elsen28928892020-06-12 08:08:19 -0700555
Marat Dukhan6f469a52020-10-21 20:02:52 -0700556 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x3456, vk41);
557 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x3456, vk41);
558 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x3456, vk41);
Erich Elsen28928892020-06-12 08:08:19 -0700559
560 const psimd_f32 vi0x2345 = extq2_f32(vi0x0123, vi0x4567);
561 const psimd_f32 vi1x2345 = extq2_f32(vi1x0123, vi1x4567);
562 const psimd_f32 vi2x2345 = extq2_f32(vi2x0123, vi2x4567);
563 const psimd_f32 vi3x2345 = extq2_f32(vi3x0123, vi3x4567);
564 const psimd_f32 vi4x2345 = extq2_f32(vi4x0123, vi4x4567);
565 const psimd_f32 vi5x2345 = extq2_f32(vi5x0123, vi5x4567);
566 const psimd_f32 vi6x2345 = extq2_f32(vi6x0123, vi6x4567);
567
Marat Dukhan6f469a52020-10-21 20:02:52 -0700568 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x2345, vk00);
569 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x2345, vk00);
570 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x2345, vk00);
Erich Elsen28928892020-06-12 08:08:19 -0700571
Marat Dukhan6f469a52020-10-21 20:02:52 -0700572 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x2345, vk10);
573 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x2345, vk10);
574 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x2345, vk10);
Erich Elsen28928892020-06-12 08:08:19 -0700575
Marat Dukhan6f469a52020-10-21 20:02:52 -0700576 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x2345, vk20);
577 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x2345, vk20);
578 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x2345, vk20);
Erich Elsen28928892020-06-12 08:08:19 -0700579
Marat Dukhan6f469a52020-10-21 20:02:52 -0700580 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x2345, vk30);
581 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x2345, vk30);
582 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x2345, vk30);
Erich Elsen28928892020-06-12 08:08:19 -0700583
Marat Dukhan6f469a52020-10-21 20:02:52 -0700584 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x2345, vk40);
585 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x2345, vk40);
586 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x2345, vk40);
Erich Elsen28928892020-06-12 08:08:19 -0700587
Erich Elsen28928892020-06-12 08:08:19 -0700588 const psimd_f32 vzero = psimd_zero_f32();
589 const psimd_f32 vi0x5678 = extq1_f32(vi0x4567, vzero);
590 const psimd_f32 vi1x5678 = extq1_f32(vi1x4567, vzero);
591 const psimd_f32 vi2x5678 = extq1_f32(vi2x4567, vzero);
592 const psimd_f32 vi3x5678 = extq1_f32(vi3x4567, vzero);
593 const psimd_f32 vi4x5678 = extq1_f32(vi4x4567, vzero);
594 const psimd_f32 vi5x5678 = extq1_f32(vi5x4567, vzero);
595 const psimd_f32 vi6x5678 = extq1_f32(vi6x4567, vzero);
596
Marat Dukhan6f469a52020-10-21 20:02:52 -0700597 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x5678, vk03);
598 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x5678, vk03);
599 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x5678, vk03);
Erich Elsen28928892020-06-12 08:08:19 -0700600
Marat Dukhan6f469a52020-10-21 20:02:52 -0700601 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x5678, vk13);
602 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x5678, vk13);
603 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x5678, vk13);
Erich Elsen28928892020-06-12 08:08:19 -0700604
Marat Dukhan6f469a52020-10-21 20:02:52 -0700605 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x5678, vk23);
606 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x5678, vk23);
607 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x5678, vk23);
Erich Elsen28928892020-06-12 08:08:19 -0700608
Marat Dukhan6f469a52020-10-21 20:02:52 -0700609 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x5678, vk33);
610 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x5678, vk33);
611 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x5678, vk33);
Erich Elsen28928892020-06-12 08:08:19 -0700612
Marat Dukhan6f469a52020-10-21 20:02:52 -0700613 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x5678, vk43);
614 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x5678, vk43);
615 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x5678, vk43);
Erich Elsen28928892020-06-12 08:08:19 -0700616
617 const psimd_f32 vi0x6789 = extq2_f32(vi0x4567, vzero);
618 const psimd_f32 vi1x6789 = extq2_f32(vi1x4567, vzero);
619 const psimd_f32 vi2x6789 = extq2_f32(vi2x4567, vzero);
620 const psimd_f32 vi3x6789 = extq2_f32(vi3x4567, vzero);
621 const psimd_f32 vi4x6789 = extq2_f32(vi4x4567, vzero);
622 const psimd_f32 vi5x6789 = extq2_f32(vi5x4567, vzero);
623 const psimd_f32 vi6x6789 = extq2_f32(vi6x4567, vzero);
624
Marat Dukhan6f469a52020-10-21 20:02:52 -0700625 vo4567p0 = psimd_qfma_f32(vo4567p0, vi0x6789, vk04);
626 vo4567p1 = psimd_qfma_f32(vo4567p1, vi1x6789, vk04);
627 vo4567p2 = psimd_qfma_f32(vo4567p2, vi2x6789, vk04);
Erich Elsen28928892020-06-12 08:08:19 -0700628
Marat Dukhan6f469a52020-10-21 20:02:52 -0700629 vo4567p0 = psimd_qfma_f32(vo4567p0, vi1x6789, vk14);
630 vo4567p1 = psimd_qfma_f32(vo4567p1, vi2x6789, vk14);
631 vo4567p2 = psimd_qfma_f32(vo4567p2, vi3x6789, vk14);
Erich Elsen28928892020-06-12 08:08:19 -0700632
Marat Dukhan6f469a52020-10-21 20:02:52 -0700633 vo4567p0 = psimd_qfma_f32(vo4567p0, vi2x6789, vk24);
634 vo4567p1 = psimd_qfma_f32(vo4567p1, vi3x6789, vk24);
635 vo4567p2 = psimd_qfma_f32(vo4567p2, vi4x6789, vk24);
Erich Elsen28928892020-06-12 08:08:19 -0700636
Marat Dukhan6f469a52020-10-21 20:02:52 -0700637 vo4567p0 = psimd_qfma_f32(vo4567p0, vi3x6789, vk34);
638 vo4567p1 = psimd_qfma_f32(vo4567p1, vi4x6789, vk34);
639 vo4567p2 = psimd_qfma_f32(vo4567p2, vi5x6789, vk34);
Erich Elsen28928892020-06-12 08:08:19 -0700640
Marat Dukhan6f469a52020-10-21 20:02:52 -0700641 vo4567p0 = psimd_qfma_f32(vo4567p0, vi4x6789, vk44);
642 vo4567p1 = psimd_qfma_f32(vo4567p1, vi5x6789, vk44);
643 vo4567p2 = psimd_qfma_f32(vo4567p2, vi6x6789, vk44);
Erich Elsen28928892020-06-12 08:08:19 -0700644
Marat Dukhan6f469a52020-10-21 20:02:52 -0700645 psimd_f32 vo0 = vo4567p0;
646 psimd_f32 vo1 = vo4567p1;
647 psimd_f32 vo2 = vo4567p2;
Erich Elsen28928892020-06-12 08:08:19 -0700648
649 vo0 = psimd_max_f32(vo0, vmin);
Erich Elsen28928892020-06-12 08:08:19 -0700650 vo1 = psimd_max_f32(vo1, vmin);
Erich Elsen28928892020-06-12 08:08:19 -0700651 vo2 = psimd_max_f32(vo2, vmin);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700652
653 vo0 = psimd_min_f32(vo0, vmax);
654 vo1 = psimd_min_f32(vo1, vmax);
Erich Elsen28928892020-06-12 08:08:19 -0700655 vo2 = psimd_min_f32(vo2, vmax);
656
Marat Dukhan75157772020-10-21 01:46:28 -0700657 if XNN_LIKELY(w & (4 * sizeof(float))) {
Marat Dukhana690c092020-10-20 19:01:06 -0700658 psimd_store_f32(o2, vo2);
Marat Dukhanc808c972020-10-20 19:39:41 -0700659 o2 += 4;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700660 psimd_store_f32(o1, vo1);
661 o1 += 4;
662 psimd_store_f32(o0, vo0);
663 o0 += 4;
Erich Elsen28928892020-06-12 08:08:19 -0700664 } else {
Marat Dukhan75157772020-10-21 01:46:28 -0700665 if (w & (2 * sizeof(float))) {
Marat Dukhanc808c972020-10-20 19:39:41 -0700666 psimd_store2_f32(o2, vo2);
667 o2 += 2;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700668 psimd_store2_f32(o1, vo1);
669 o1 += 2;
670 psimd_store2_f32(o0, vo0);
671 o0 += 2;
672
Erich Elsen28928892020-06-12 08:08:19 -0700673 vo0 = psimd_splat2_f32(vo0);
674 vo1 = psimd_splat2_f32(vo1);
675 vo2 = psimd_splat2_f32(vo2);
676 }
Marat Dukhan75157772020-10-21 01:46:28 -0700677 if (w & (1 * sizeof(float))) {
Marat Dukhanc808c972020-10-20 19:39:41 -0700678 psimd_store1_f32(o2, vo2);
679 o2 += 1;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700680 psimd_store1_f32(o1, vo1);
681 o1 += 1;
682 psimd_store1_f32(o0, vo0);
683 o0 += 1;
Erich Elsen28928892020-06-12 08:08:19 -0700684 }
685 }
686 }
687
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700688 i0 = (const float*) ((uintptr_t) i3 - input_decrement);
689 i1 = (const float*) ((uintptr_t) i4 - input_decrement);
Marat Dukhan75157772020-10-21 01:46:28 -0700690 i2 = (const float*) ((uintptr_t) i1 + input_width);
691 i3 = (const float*) ((uintptr_t) i2 + input_width);
692 i4 = (const float*) ((uintptr_t) i3 + input_width);
693 i5 = (const float*) ((uintptr_t) i4 + input_width);
694 i6 = (const float*) ((uintptr_t) i5 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700695
Marat Dukhanc808c972020-10-20 19:39:41 -0700696 o0 = o2;
Marat Dukhan75157772020-10-21 01:46:28 -0700697 o1 = (float*) ((uintptr_t) o0 + input_width);
698 o2 = (float*) ((uintptr_t) o1 + input_width);
Erich Elsen28928892020-06-12 08:08:19 -0700699
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700700 output_height = doz(output_height, 3);
701 } while (output_height != 0);
Erich Elsen28928892020-06-12 08:08:19 -0700702}