// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv/up-psimd.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <psimd.h>

#include <xnnpack/dwconv.h>


17void xnn_f32_dwconv_ukernel_up4x9__psimd(
18 size_t channels,
19 size_t output_width,
20 const float** input,
21 const float* weights,
22 float* output,
23 size_t input_stride,
24 size_t output_increment,
25 const union xnn_f32_output_params params[restrict static 1])
26{
27 assert(channels != 0);
28 assert(output_width != 0);
29
30 const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
31 const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
32 do {
33 const float* i0 = input[0];
34 const float* i1 = input[1];
35 const float* i2 = input[2];
36 const float* i3 = input[3];
37 const float* i4 = input[4];
38 const float* i5 = input[5];
39 const float* i6 = input[6];
40 const float* i7 = input[7];
41 const float* i8 = input[8];
42 input = (const float**) ((uintptr_t) input + input_stride);
43
44 size_t c = channels;
45 const float* w = weights;
46 for (; c >= 4; c -= 4) {
47 psimd_f32 vacc0 = psimd_load_f32(w);
48
49 const psimd_f32 vi0 = psimd_load_f32(i0);
50 const psimd_f32 vk0 = psimd_load_f32(w + 4);
51 vacc0 = psimd_qfma_f32(vacc0, vi0, vk0);
52 i0 += 4;
53
54 const psimd_f32 vi1 = psimd_load_f32(i1);
55 const psimd_f32 vk1 = psimd_load_f32(w + 8);
56 psimd_f32 vacc1 = psimd_mul_f32(vi1, vk1);
57 i1 += 4;
58
59 const psimd_f32 vi2 = psimd_load_f32(i2);
60 const psimd_f32 vk2 = psimd_load_f32(w + 12);
61 vacc0 = psimd_qfma_f32(vacc0, vi2, vk2);
62 i2 += 4;
63
64 const psimd_f32 vi3 = psimd_load_f32(i3);
65 const psimd_f32 vk3 = psimd_load_f32(w + 16);
66 vacc1 = psimd_qfma_f32(vacc1, vi3, vk3);
67 i3 += 4;
68
69 const psimd_f32 vi4 = psimd_load_f32(i4);
70 const psimd_f32 vk4 = psimd_load_f32(w + 20);
71 vacc0 = psimd_qfma_f32(vacc0, vi4, vk4);
72 i4 += 4;
73
74 const psimd_f32 vi5 = psimd_load_f32(i5);
75 const psimd_f32 vk5 = psimd_load_f32(w + 24);
76 vacc1 = psimd_qfma_f32(vacc1, vi5, vk5);
77 i5 += 4;
78
79 const psimd_f32 vi6 = psimd_load_f32(i6);
80 const psimd_f32 vk6 = psimd_load_f32(w + 28);
81 vacc0 = psimd_qfma_f32(vacc0, vi6, vk6);
82 i6 += 4;
83
84 const psimd_f32 vi7 = psimd_load_f32(i7);
85 const psimd_f32 vk7 = psimd_load_f32(w + 32);
86 vacc1 = psimd_qfma_f32(vacc1, vi7, vk7);
87 i7 += 4;
88
89 const psimd_f32 vi8 = psimd_load_f32(i8);
90 const psimd_f32 vk8 = psimd_load_f32(w + 36);
91 vacc0 = psimd_qfma_f32(vacc0, vi8, vk8);
92 i8 += 4;
93
94 w += 40;
95
96 vacc0 = psimd_add_f32(vacc0, vacc1);
97
98 vacc0 = psimd_max_f32(vacc0, vmin);
99 vacc0 = psimd_min_f32(vacc0, vmax);
100
101 psimd_store_f32(output, vacc0);
102 output += 4;
103 }
104 if XNN_UNLIKELY(c != 0) {
105 psimd_f32 vacc = psimd_load_f32(w);
106
107 const psimd_f32 vi0 = psimd_load_f32(i0);
108 const psimd_f32 vk0 = psimd_load_f32(w + 4);
109 vacc = psimd_qfma_f32(vacc, vi0, vk0);
110
111 const psimd_f32 vi1 = psimd_load_f32(i1);
112 const psimd_f32 vk1 = psimd_load_f32(w + 8);
113 vacc = psimd_qfma_f32(vacc, vi1, vk1);
114
115 const psimd_f32 vi2 = psimd_load_f32(i2);
116 const psimd_f32 vk2 = psimd_load_f32(w + 12);
117 vacc = psimd_qfma_f32(vacc, vi2, vk2);
118
119 const psimd_f32 vi3 = psimd_load_f32(i3);
120 const psimd_f32 vk3 = psimd_load_f32(w + 16);
121 vacc = psimd_qfma_f32(vacc, vi3, vk3);
122
123 const psimd_f32 vi4 = psimd_load_f32(i4);
124 const psimd_f32 vk4 = psimd_load_f32(w + 20);
125 vacc = psimd_qfma_f32(vacc, vi4, vk4);
126
127 const psimd_f32 vi5 = psimd_load_f32(i5);
128 const psimd_f32 vk5 = psimd_load_f32(w + 24);
129 vacc = psimd_qfma_f32(vacc, vi5, vk5);
130
131 const psimd_f32 vi6 = psimd_load_f32(i6);
132 const psimd_f32 vk6 = psimd_load_f32(w + 28);
133 vacc = psimd_qfma_f32(vacc, vi6, vk6);
134
135 const psimd_f32 vi7 = psimd_load_f32(i7);
136 const psimd_f32 vk7 = psimd_load_f32(w + 32);
137 vacc = psimd_qfma_f32(vacc, vi7, vk7);
138
139 const psimd_f32 vi8 = psimd_load_f32(i8);
140 const psimd_f32 vk8 = psimd_load_f32(w + 36);
141 vacc = psimd_qfma_f32(vacc, vi8, vk8);
142
143 w += 40;
144
145 vacc = psimd_max_f32(vacc, vmin);
146 vacc = psimd_min_f32(vacc, vmax);
147
148 if (c & 2) {
149 psimd_store2_f32(output, vacc);
150 vacc = psimd_concat_hi_f32(vacc, vacc);
151 output += 2;
152 }
153 if (c & 1) {
154 psimd_store1_f32(output, vacc);
155 output += 1;
156 }
157 }
158
159 output = (float*) ((uintptr_t) output + output_increment);
160 } while (--output_width != 0);
161}