// Auto-generated file. Do not edit!
2// Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
3// Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
13# size_t mr, (x0) - unused. mr = 1
14# size_t nc, x1
15# size_t kc, x2 / x0
16# const uint8_t*restrict a, x3
17# size_t a_stride, (x4) - unused
18# const void*restrict w, x5
19# uint8_t*restrict c, x6
20# size_t cm_stride, (x7) - unused
21# size_t cn_stride, [sp] -> x14
22# const float*restrict acc, [sp + 8] -> x15
# const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) [sp + 16] -> x8

25# d8-d15 need to be preserved if used.
26# x19-30 need to be preserved if used.
27
28# A pointer
29# x3 a0
30
31# C pointer
32# x6 c0
33
# Scale v4. Clamp v5 (min), v6 (max)
35
BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        // Fetch the stack-passed arguments.
        ldp     x14, x15, [sp]                  // x14 = cn_stride, x15 = acc
        ldr     x8, [sp, 16]                    // x8  = params

        // Replicate the first three halffloats at params across whole vectors.
        // Their use below: v4 multiplies (FMUL), v5 is the lower bound (FMAX),
        // v6 is the upper bound (FMIN).
        ld3r    {v4.8h, v5.8h, v6.8h}, [x8]

0:
        // Outer loop: one 1x16 tile of C per pass.
        // Seed the primary accumulators from acc; x15 advances by 32 bytes.
        ldp     q16, q17, [x15], 32

        // Zero a second accumulator pair so consecutive FMLAs target
        // different registers (pipelining).
        movi    v18.8h, 0
        movi    v19.8h, 0

        // k = kc - 4. Fewer than 4 bytes of A (2 halffloats) skips the main loop.
        subs    x0, x2, 4
        b.lo    3f

1:
        // Main loop: 2 halffloats of A (4 bytes) against 2 x 16 halffloats of W.
        ldr     s0, [x3], 4
        ldp     q20, q21, [x5], 32
        ldp     q22, q23, [x5], 32
        fmla    v16.8h, v20.8h, v0.h[0]
        fmla    v17.8h, v21.8h, v0.h[0]
        subs    x0, x0, 4
        fmla    v18.8h, v22.8h, v0.h[1]
        fmla    v19.8h, v23.8h, v0.h[1]
        b.hs    1b

        // Bit 1 of k is set when one halffloat (2 bytes) of A is left over.
        tbnz    x0, 1, 3f

2:
        // Fold the second accumulator pair into the primary pair.
        fadd    v16.8h, v16.8h, v18.8h
        fadd    v17.8h, v17.8h, v19.8h

        // nc -= 16. These flags drive both B.LO and B.HI below; none of the
        // instructions in between writes the condition flags.
        subs    x1, x1, 16

        // Scale, then clamp to [v5, v6].
        fmul    v16.8h, v16.8h, v4.8h
        fmul    v17.8h, v17.8h, v4.8h
        fmax    v16.8h, v16.8h, v5.8h
        fmax    v17.8h, v17.8h, v5.8h
        fmin    v16.8h, v16.8h, v6.8h
        fmin    v17.8h, v17.8h, v6.8h

        // Fewer than 16 columns were left: take the partial-store path.
        b.lo    4f

        // Store the full 1 x 16 tile and step C by cn_stride.
        stp     q16, q17, [x6]
        add     x6, x6, x14

        // Rewind A by kc bytes for the next tile.
        sub     x3, x3, x2

        // Another tile if columns remain.
        b.hi    0b

        ret

3:
        // Remainder: 1 halffloat of A (2 bytes) against 16 halffloats of W.
        ldr     h0, [x3], 2
        ldp     q20, q21, [x5], 32
        fmla    v16.8h, v20.8h, v0.h[0]
        fmla    v17.8h, v21.8h, v0.h[0]
        b       2b

4:
        // Partial store. Subtracting 16 left the low four bits of x1 intact,
        // so each bit still selects an 8/4/2/1-column chunk to write.
        tbz     x1, 3, 5f
        str     q16, [x6], 16                   // 8 columns
        mov     v16.16b, v17.16b                // next 8 become the pending data

5:
        tbz     x1, 2, 6f
        str     d16, [x6], 8                    // 4 columns
        dup     d16, v16.d[1]                   // shift the upper half down

6:
        tbz     x1, 1, 7f
        str     s16, [x6], 4                    // 2 columns
        dup     s16, v16.s[1]                   // shift the next pair down

7:
        tbz     x1, 0, 8f
        str     h16, [x6]                       // final column

8:
        ret

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32
129
// ELF targets only: emit an empty .note.GNU-stack section, the conventional
// marker that this object does not require an executable stack.
#if defined(__ELF__)
.section ".note.GNU-stack","",%progbits
#endif