blob: 4db318b5a0ff8c8bf8e64340eb13bea386657e85 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

6#include <assert.h>
7
8#include <arm_neon.h>
9
10#include <xnnpack/pad.h>
11
12
// Copies up to two rows of 32-bit elements from x to y, writing the constant
// c in front of and behind the copied data of every row.
//
//   m        - row count; asserted to be at most 2. Any value other than 2 is
//              handled as a single row: the second-row pointers then alias the
//              first row, so the same row is simply written twice.
//   n        - size of the copied data per row, in bytes (multiple of 4).
//   l        - size of the padding written before the data, in bytes
//              (multiple of 4).
//   r        - size of the padding written after the data, in bytes
//              (multiple of 4).
//   c        - 32-bit value used to fill both padding regions.
//   x        - first input row; exactly n bytes are read from each row.
//   x_stride - distance between the two input rows, in bytes.
//   y        - first output row; l + n + r bytes are written to each row.
//   y_stride - distance between the two output rows, in bytes.
void xnn_x32_pad_x2__neon(
    size_t m,
    size_t n,
    size_t l,
    size_t r,
    uint32_t c,
    const void* x,
    size_t x_stride,
    void* y,
    size_t y_stride)
{
  assert(m <= 2);
  assert(l % 4 == 0);
  assert(n % 4 == 0);
  assert(r % 4 == 0);

  const uint32_t* x0 = x;
  uint32_t* y0 = y;

  // Strides are byte offsets, so the address arithmetic is done on integers.
  const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride);
  uint32_t* y1 = (uint32_t*) ((uintptr_t) y0 + y_stride);
  if (m != 2) {
    // Single row: redirect the second row onto the first so that the code
    // below never touches memory at x + x_stride or y + y_stride.
    x1 = x0;
    y1 = y0;
  }
  const uint32x4_t vc = vmovq_n_u32(c);

  // Pre-pad input channels.
  for (; l >= 16; l -= 16) {
    vst1q_u32(y0, vc); y0 += 4;
    vst1q_u32(y1, vc); y1 += 4;
  }
  if (l & 8) {
    vst1_u32(y0, vget_low_u32(vc)); y0 += 2;
    vst1_u32(y1, vget_low_u32(vc)); y1 += 2;
  }
  if (l & 4) {
    vst1q_lane_u32(y0, vc, 0); y0 += 1;
    vst1q_lane_u32(y1, vc, 0); y1 += 1;
  }

  // Copy input channels.
  for (; n >= 16; n -= 16) {
    const uint32x4_t vt0 = vld1q_u32(x0); x0 += 4;
    const uint32x4_t vt1 = vld1q_u32(x1); x1 += 4;
    vst1q_u32(y0, vt0); y0 += 4;
    vst1q_u32(y1, vt1); y1 += 4;
  }
  // Remainder of 4, 8 or 12 bytes: load exactly as many elements as are
  // stored, so nothing beyond the n input bytes of a row is ever read.
  if (n & 8) {
    const uint32x2_t vt0 = vld1_u32(x0); x0 += 2;
    const uint32x2_t vt1 = vld1_u32(x1); x1 += 2;
    vst1_u32(y0, vt0); y0 += 2;
    vst1_u32(y1, vt1); y1 += 2;
  }
  if (n & 4) {
    // vld1_dup_u32 reads a single 32-bit element (and broadcasts it).
    const uint32x2_t vt0 = vld1_dup_u32(x0);
    const uint32x2_t vt1 = vld1_dup_u32(x1);
    vst1_lane_u32(y0, vt0, 0); y0 += 1;
    vst1_lane_u32(y1, vt1, 0); y1 += 1;
  }

  // Post-pad input channels.
  for (; r >= 16; r -= 16) {
    vst1q_u32(y0, vc); y0 += 4;
    vst1q_u32(y1, vc); y1 += 4;
  }
  if (r & 8) {
    vst1_u32(y0, vget_low_u32(vc)); y0 += 2;
    vst1_u32(y1, vget_low_u32(vc)); y1 += 2;
  }
  if (r & 4) {
    vst1q_lane_u32(y0, vc, 0);
    vst1q_lane_u32(y1, vc, 0);
  }
}