// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv/up-neon.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>

// Depthwise convolution microkernel: fp32, NEON, min/max output clamping.
// "up8x4" = channels processed in groups of 8, with a 4-tap kernel (rows
// i0..i3 / filter vectors k0..k3); "acc2" = two parallel accumulator chains
// (p0 and p1) that are summed once per channel group to shorten the FMA
// dependency chain.
//
// Packed weight layout (established by the sequential loads in the main
// loop): for each group of 8 channels, 8 bias floats followed by 8 floats
// for each of the 4 taps — 40 floats per group.
//
//   channels         - number of channels; must be non-zero
//   output_width     - number of output pixels to produce; must be non-zero
//   input            - per-output-pixel array of 4 input-row pointers
//   weights          - packed bias + filter data (layout above)
//   output           - output buffer
//   input_stride     - byte stride from one entry of `input` to the next
//   output_increment - byte increment applied to `output` after each pixel
//   params           - scalar min/max clamping bounds
void xnn_f32_dwconv_minmax_ukernel_up8x4__neon_acc2(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    size_t input_stride,
    size_t output_increment,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(channels != 0);
  assert(output_width != 0);

  // Broadcast the scalar clamping bounds into all four lanes once, up front.
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  do {
    // One input-row pointer per kernel tap for this output pixel.
    const float* i0 = input[0];
    assert(i0 != NULL);
    const float* i1 = input[1];
    assert(i1 != NULL);
    const float* i2 = input[2];
    assert(i2 != NULL);
    const float* i3 = input[3];
    assert(i3 != NULL);
    // Advance to the next pixel's pointer array (stride is in bytes).
    input = (const float**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const float* w = weights;
    // Main loop: 8 channels per iteration; consumes 40 weight floats.
    for (; c >= 8; c -= 8) {
      // Accumulators start from the bias.
      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
      float32x4_t vacc4567p0 = vld1q_f32(w); w += 4;


      // Taps 0 and 2 feed accumulator chain p0; taps 1 and 3 feed p1.
      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
      const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
      vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567);

      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
      const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
      float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567);

      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
      const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
      vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567);

      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
      const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);
      vacc4567p1 = vmlaq_f32(vacc4567p1, vi3x4567, vk3x4567);

      // Add up all accumulators to vacc01234567p0
      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
      vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1);

      // Clamp to [min, max].
      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
      float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin);
      vacc0123 = vminq_f32(vacc0123, vmax);
      vacc4567 = vminq_f32(vacc4567, vmax);

      vst1q_f32(output, vacc0123); output += 4;
      vst1q_f32(output, vacc4567); output += 4;
    }
    // Remainder loop: 4 channels. Runs at most once (c < 8 here), so the
    // fixed offsets are valid: after `w += 4` (first half of the bias), each
    // tap's lower 4 lanes sit at w+4, w+12, w+20, w+28 — skipping the upper
    // 4 lanes of the bias and of each tap, which the scalar tail below may
    // still consume.
    for (; c >= 4; c -= 4) {
      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;


      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vk0x0123 = vld1q_f32(w + 4);
      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);

      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vk1x0123 = vld1q_f32(w + 12);
      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);

      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vk2x0123 = vld1q_f32(w + 20);
      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);

      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vk3x0123 = vld1q_f32(w + 28);
      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);

      // Add up all accumulators to vacc0123p0
      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);

      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
      vacc0123 = vminq_f32(vacc0123, vmax);

      vst1q_f32(output, vacc0123); output += 4;
    }
    // Scalar tail: 1-3 channels. `w` was not advanced past this group's
    // bias in the loop above (or past the first half if that loop ran), so
    // the bias is at w and the taps at w+8, w+16, w+24, w+32. Full vectors
    // are computed; only `c` lanes are stored.
    if XNN_UNLIKELY(c != 0) {
      float32x4_t vacc0123p0 = vld1q_f32(w);


      const float32x4_t vi0x0123 = vld1q_f32(i0);
      const float32x4_t vk0x0123 = vld1q_f32(w + 8);
      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);

      const float32x4_t vi1x0123 = vld1q_f32(i1);
      const float32x4_t vk1x0123 = vld1q_f32(w + 16);
      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);

      const float32x4_t vi2x0123 = vld1q_f32(i2);
      const float32x4_t vk2x0123 = vld1q_f32(w + 24);
      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);

      const float32x4_t vi3x0123 = vld1q_f32(i3);
      const float32x4_t vk3x0123 = vld1q_f32(w + 32);
      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);

      // Add up all accumulators to vacc0123p0
      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);

      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
      vacc0123 = vminq_f32(vacc0123, vmax);

      // Store 2 lanes, then 1 lane, as needed for the remaining channels.
      float32x2_t vacc01 = vget_low_f32(vacc0123);
      if (c & 2) {
        vst1_f32(output, vacc01); output += 2;
        vacc01 = vget_high_f32(vacc0123);
      }
      if (c & 1) {
        vst1_lane_f32(output, vacc01, 0); output += 1;
      }
    }

    // Advance to the next output pixel (increment is in bytes).
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}