// blob: c3338523194233f70f5dcd5af89e237a800c6d0f
// blame: Frank Barchard, bddfbcd, 2020-04-15 12:32:41 -0700
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Project header supplying the BEGIN_FUNCTION / END_FUNCTION macros used below.
#include <xnnpack/assembly.h>

# GEMM microkernel template: 1 row x 8 columns, half precision, reading
# 64 bits (4 halffloats) of A per main-loop iteration.  INC selects the
# variant that starts from caller-supplied accumulators (acc) instead of the
# bias stored at the head of each packed w panel.
#
# For every group of up to 8 output columns it computes, in half precision:
#   c[0][n..n+7] = min(max(scale * (init + sum_k a[0][k] * w[k][n..n+7]), lo), hi)
# where init is the bias read from w (or the acc block when INC is set) and
# scale, lo, hi are the three consecutive halffloats read from params.
# kc is consumed in bytes (2 per halffloat); nc counts output columns.
#
# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_minmax_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_minmax_params params[restrict static 1])  [sp + 8] -> x8

# NOTE(review): acc is declared float* above but is read as 8 halffloats per
# pass (LDR q16 feeding .8h arithmetic) - confirm the intended C type.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# Neither group is touched here, so nothing is saved or restored.

# General register usage
# x0  k, remaining bytes of the current row of A (overwrites unused mr)
# x1  nc, remaining output columns
# x2  kc, bytes per row of A
# x8  params pointer
# x14 cn_stride
# x15 acc pointer (INC variant only)

# A pointer
# x3 a0

# C pointer
# x6 c0

# Vector register usage
# A0  v0
# B   v20 v21 v22 v23
# C   v16 v18
# Clamp v4 (scale), v5 (lower bound), v6 (upper bound)


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load params values
        # LD3R broadcasts three consecutive halffloats: v4, v5, v6
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Outer loop - one pass per group of 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDR q16, [x15], 16
        $else:
          # Load initial bias from w into accumulators
          LDR q16, [x5], 16

        MOVI v18.8h, 0  // second set of C for pipelining FMLA

        # Is there at least 4 halffloats (8 bytes)
        SUBS x0, x2, 8  // k = kc - 8

        B.LO 6f         // kc < 8 bytes: remainder handling only

        # Main loop - 4 halffloats of A (8 bytes)
        # v16 and v18 alternate as FMLA destinations so consecutive
        # multiply-accumulates do not depend on each other.
1:
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        SUBS x0, x0, 8
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v21.8h, v0.h[1]
        FMLA v16.8h, v22.8h, v0.h[2]
        FMLA v18.8h, v23.8h, v0.h[3]
        B.HS 1b         // flags still come from the SUBS above

        # Here x0 = (remaining bytes) - 8, so bits 2 and 1 of x0 equal
        # bits 2 and 1 of the remaining byte count.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 7f
        # Is there a remainder?- 1 halffloats of A (2 bytes)
        TBNZ x0, 1, 8f

        # Epilogue - merge the two accumulators, scale, clamp, store
4:
        FADD v16.8h, v16.8h, v18.8h
        SUBS x1, x1, 8  // nc -= 8; flags are consumed by B.LO and B.HI below

        # Scale and Clamp
        FMUL v16.8h, v16.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h

        # Store full 1 x 8
        B.LO 10f        // fewer than 8 columns left: partial store

        ST1 {v16.16b}, [x6], x14  // c0 += cn_stride
        SUB x3, x3, x2  // a0 -= kc

        B.HI 0b         // more columns remain
        RET

        # Entry for kc < 8 bytes: no 4-byte remainder means only the
        # 2-byte remainder is handled.
6:
        TBZ x0, 2, 8f
7:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v21.8h, v0.h[1]
        TBZ x0, 1, 4b

8:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q20, [x5], 16
        FMLA v16.8h, v20.8h, v0.h[0]
        B 4b

        # Store odd channels
        # x1 = nc - 8 here, so its low 3 bits equal those of the remaining
        # column count.  After each partial store the next unstored lanes
        # are moved down to lane 0.
10:
        TBZ x1, 2, 11f
        STR d16, [x6], 8          // 4 halffloats
        DUP d16, v16.d[1]

11:
        TBZ x1, 1, 12f
        STR s16, [x6], 4          // 2 halffloats
        DUP s16, v16.s[1]

12:
        TBZ x1, 0, 13f
        STR h16, [x6]             // last halffloat
13:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64

// On ELF targets, emit an empty .note.GNU-stack section so the object is
// marked as not requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif