Modify the ARM64 assembly file so that it uses only syntax that the clang assembler in Xcode 5.x can understand. These changes are all cosmetic in nature — they do not change the meaning or readability of the code, nor the ability to build it for Linux. In fact, the code is now more in compliance with the ARM64 programming manual. In addition to these changes, there were a couple of instructions that clang simply doesn't support, so gas-preprocessor.pl was modified so that it now converts those into equivalent instructions that clang can handle.
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.4.x@1450 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index f488b0f..2186f24 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -6,6 +6,7 @@
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
* Copyright (C) 2013-2014, Linaro Limited
* Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014, D. R. Commander. All rights reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -197,21 +198,21 @@
tmp13 = q1; \
}
-#define XFIX_0_899976223 v0.4h[0]
-#define XFIX_0_541196100 v0.4h[1]
-#define XFIX_2_562915447 v0.4h[2]
-#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
-#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
-#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
-#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
-#define XFIX_1_175875602 v1.4h[3]
-#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
-#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
-#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
-#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
+#define XFIX_0_899976223 v0.h[0]
+#define XFIX_0_541196100 v0.h[1]
+#define XFIX_2_562915447 v0.h[2]
+#define XFIX_0_298631336_MINUS_0_899976223 v0.h[3]
+#define XFIX_1_501321110_MINUS_0_899976223 v1.h[0]
+#define XFIX_2_053119869_MINUS_2_562915447 v1.h[1]
+#define XFIX_0_541196100_PLUS_0_765366865 v1.h[2]
+#define XFIX_1_175875602 v1.h[3]
+#define XFIX_1_175875602_MINUS_0_390180644 v2.h[0]
+#define XFIX_0_541196100_MINUS_1_847759065 v2.h[1]
+#define XFIX_3_072711026_MINUS_2_562915447 v2.h[2]
+#define XFIX_1_175875602_MINUS_1_961570560 v2.h[3]
.balign 16
-jsimd_idct_islow_neon_consts:
+Ljsimd_idct_islow_neon_consts:
.short FIX_0_899976223 /* d0[0] */
.short FIX_0_541196100 /* d0[1] */
.short FIX_2_562915447 /* d0[2] */
@@ -256,54 +257,54 @@
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
sub sp, sp, 272
str x15, [sp], 16
- adr x15, jsimd_idct_islow_neon_consts
- st1 {v0.8b - v3.8b}, [sp], 32
- st1 {v4.8b - v7.8b}, [sp], 32
- st1 {v8.8b - v11.8b}, [sp], 32
- st1 {v12.8b - v15.8b}, [sp], 32
- st1 {v16.8b - v19.8b}, [sp], 32
- st1 {v20.8b - v23.8b}, [sp], 32
- st1 {v24.8b - v27.8b}, [sp], 32
- st1 {v28.8b - v31.8b}, [sp], 32
+ adr x15, Ljsimd_idct_islow_neon_consts
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+ st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
mul v16.4h, v16.4h, v0.4h
mul v17.4h, v17.4h, v1.4h
- ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
+ ins v16.d[1], v17.d[0] /* 128 bit q8 */
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
mul v18.4h, v18.4h, v2.4h
mul v19.4h, v19.4h, v3.4h
- ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
+ ins v18.d[1], v19.d[0] /* 128 bit q9 */
ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
mul v20.4h, v20.4h, v4.4h
mul v21.4h, v21.4h, v5.4h
- ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
+ ins v20.d[1], v21.d[0] /* 128 bit q10 */
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
mul v22.4h, v22.4h, v6.4h
mul v23.4h, v23.4h, v7.4h
- ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
+ ins v22.d[1], v23.d[0] /* 128 bit q11 */
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
mul v24.4h, v24.4h, v0.4h
mul v25.4h, v25.4h, v1.4h
- ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
+ ins v24.d[1], v25.d[0] /* 128 bit q12 */
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
mul v28.4h, v28.4h, v4.4h
mul v29.4h, v29.4h, v5.4h
- ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
+ ins v28.d[1], v29.d[0] /* 128 bit q14 */
mul v26.4h, v26.4h, v2.4h
mul v27.4h, v27.4h, v3.4h
- ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
+ ins v26.d[1], v27.d[0] /* 128 bit q13 */
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
add x15, x15, #16
mul v30.4h, v30.4h, v6.4h
mul v31.4h, v31.4h, v7.4h
- ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
+ ins v30.d[1], v31.d[0] /* 128 bit q15 */
/* Go to the bottom of the stack */
sub sp, sp, 352
stp x4, x5, [sp], 16
- st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
- st1 {v12.4h - v15.4h}, [sp], 32
+ st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32 /* save NEON registers */
+ st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
/* 1-D IDCT, pass 1, left 4x8 half */
add v4.4h, ROW7L.4h, ROW3L.4h
add v5.4h, ROW5L.4h, ROW1L.4h
@@ -378,7 +379,7 @@
rshrn ROW0L.4h, v12.4s, #11
rshrn ROW4L.4h, v6.4s, #11
- beq 3f /* Go to do some special handling for the sparse right 4x8 half */
+ b.eq 3f /* Go to do some special handling for the sparse right 4x8 half */
/* 1-D IDCT, pass 1, right 4x8 half */
ld1 {v2.4h}, [x15] /* reload constants */
@@ -553,33 +554,33 @@
shrn ROW4R.4h, v6.4s, #16
2: /* Descale to 8-bit and range limit */
- ins v16.2d[1], v17.2d[0]
- ins v18.2d[1], v19.2d[0]
- ins v20.2d[1], v21.2d[0]
- ins v22.2d[1], v23.2d[0]
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ins v20.d[1], v21.d[0]
+ ins v22.d[1], v23.d[0]
sqrshrn v16.8b, v16.8h, #2
sqrshrn2 v16.16b, v18.8h, #2
sqrshrn v18.8b, v20.8h, #2
sqrshrn2 v18.16b, v22.8h, #2
/* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
- ld1 {v8.4h - v11.4h}, [sp], 32
- ld1 {v12.4h - v15.4h}, [sp], 32
- ins v24.2d[1], v25.2d[0]
+ ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
+ ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
+ ins v24.d[1], v25.d[0]
sqrshrn v20.8b, v24.8h, #2
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
/* trn1 v16.8h, v16.8h, v18.8h */
transpose v16, v18, v3, .16b, .8h
- ins v26.2d[1], v27.2d[0]
- ins v28.2d[1], v29.2d[0]
- ins v30.2d[1], v31.2d[0]
+ ins v26.d[1], v27.d[0]
+ ins v28.d[1], v29.d[0]
+ ins v30.d[1], v31.d[0]
sqrshrn2 v20.16b, v26.8h, #2
sqrshrn v22.8b, v28.8h, #2
movi v0.16b, #(CENTERJSAMPLE)
sqrshrn2 v22.16b, v30.8h, #2
- transpose_single v16, v17, v3, .2d, .8b
- transpose_single v18, v19, v3, .2d, .8b
+ transpose_single v16, v17, v3, .d, .8b
+ transpose_single v18, v19, v3, .d, .8b
add v16.8b, v16.8b, v0.8b
add v17.8b, v17.8b, v0.8b
add v18.8b, v18.8b, v0.8b
@@ -590,7 +591,7 @@
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
st1 {v16.8b}, [TMP1]
- transpose_single v20, v21, v3, .2d, .8b
+ transpose_single v20, v21, v3, .d, .8b
st1 {v17.8b}, [TMP2]
ldp TMP1, TMP2, [OUTPUT_BUF], 16
add TMP1, TMP1, OUTPUT_COL
@@ -605,7 +606,7 @@
add TMP2, TMP2, OUTPUT_COL
add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
- transpose_single v22, v23, v3, .2d, .8b
+ transpose_single v22, v23, v3, .d, .8b
st1 {v20.8b}, [TMP1]
add v22.8b, v22.8b, v0.8b
add v23.8b, v23.8b, v0.8b
@@ -613,14 +614,14 @@
st1 {v22.8b}, [TMP3]
st1 {v23.8b}, [TMP4]
ldr x15, [sp], 16
- ld1 {v0.8b - v3.8b}, [sp], 32
- ld1 {v4.8b - v7.8b}, [sp], 32
- ld1 {v8.8b - v11.8b}, [sp], 32
- ld1 {v12.8b - v15.8b}, [sp], 32
- ld1 {v16.8b - v19.8b}, [sp], 32
- ld1 {v20.8b - v23.8b}, [sp], 32
- ld1 {v24.8b - v27.8b}, [sp], 32
- ld1 {v28.8b - v31.8b}, [sp], 32
+ ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+ ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
blr x30
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
@@ -636,17 +637,17 @@
transpose ROW0L, ROW2L, v3, .16b, .2s
transpose ROW5L, ROW7L, v3, .16b, .2s
cmp x0, #0
- beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+ b.eq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
/* Only row 0 is non-zero for the right 4x8 half */
- dup ROW1R.4h, ROW0R.4h[1]
- dup ROW2R.4h, ROW0R.4h[2]
- dup ROW3R.4h, ROW0R.4h[3]
- dup ROW4R.4h, ROW0R.4h[0]
- dup ROW5R.4h, ROW0R.4h[1]
- dup ROW6R.4h, ROW0R.4h[2]
- dup ROW7R.4h, ROW0R.4h[3]
- dup ROW0R.4h, ROW0R.4h[0]
+ dup ROW1R.4h, ROW0R.h[1]
+ dup ROW2R.4h, ROW0R.h[2]
+ dup ROW3R.4h, ROW0R.h[3]
+ dup ROW4R.4h, ROW0R.h[0]
+ dup ROW5R.4h, ROW0R.h[1]
+ dup ROW6R.4h, ROW0R.h[2]
+ dup ROW7R.4h, ROW0R.h[3]
+ dup ROW0R.4h, ROW0R.h[0]
b 1b /* Go to 'normal' second pass */
4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
@@ -770,13 +771,13 @@
* per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
*/
-#define XFIX_1_082392200 v0.4h[0]
-#define XFIX_1_414213562 v0.4h[1]
-#define XFIX_1_847759065 v0.4h[2]
-#define XFIX_2_613125930 v0.4h[3]
+#define XFIX_1_082392200 v0.h[0]
+#define XFIX_1_414213562 v0.h[1]
+#define XFIX_1_847759065 v0.h[2]
+#define XFIX_2_613125930 v0.h[3]
.balign 16
-jsimd_idct_ifast_neon_consts:
+Ljsimd_idct_ifast_neon_consts:
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
@@ -810,12 +811,12 @@
/* Save NEON registers used in fast IDCT */
sub sp, sp, #176
stp x22, x23, [sp], 16
- adr x23, jsimd_idct_ifast_neon_consts
- st1 {v0.8b - v3.8b}, [sp], 32
- st1 {v4.8b - v7.8b}, [sp], 32
- st1 {v8.8b - v11.8b}, [sp], 32
- st1 {v12.8b - v15.8b}, [sp], 32
- st1 {v16.8b - v19.8b}, [sp], 32
+ adr x23, Ljsimd_idct_ifast_neon_consts
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
@@ -909,24 +910,24 @@
trn2 v15.4s, v18.4s, v15.4s
/* vswp v14.4h, v10-MSB.4h */
umov x22, v14.d[0]
- ins v14.2d[0], v10.2d[1]
- ins v10.2d[1], x22
+ ins v14.d[0], v10.d[1]
+ ins v10.d[1], x22
/* vswp v13.4h, v9MSB.4h */
umov x22, v13.d[0]
- ins v13.2d[0], v9.2d[1]
- ins v9.2d[1], x22
+ ins v13.d[0], v9.d[1]
+ ins v9.d[1], x22
/* 1-D IDCT, pass 2 */
sub v2.8h, v10.8h, v14.8h
/* vswp v15.4h, v11MSB.4h */
umov x22, v15.d[0]
- ins v15.2d[0], v11.2d[1]
- ins v11.2d[1], x22
+ ins v15.d[0], v11.d[1]
+ ins v11.d[1], x22
add v14.8h, v10.8h, v14.8h
/* vswp v12.4h, v8-MSB.4h */
umov x22, v12.d[0]
- ins v12.2d[0], v8.2d[1]
- ins v8.2d[1], x22
+ ins v12.d[0], v8.d[1]
+ ins v8.d[1], x22
sub v1.8h, v11.8h, v13.8h
add v13.8h, v11.8h, v13.8h
sub v5.8h, v9.8h, v15.8h
@@ -997,13 +998,13 @@
trn1 v9.4s, v9.4s, v11.4s
trn2 v11.4s, v18.4s, v11.4s
/* make copy */
- ins v17.2d[0], v8.2d[1]
+ ins v17.d[0], v8.d[1]
/* Transpose d16-d17-msb */
mov v18.16b, v8.16b
trn1 v8.8b, v8.8b, v17.8b
trn2 v17.8b, v18.8b, v17.8b
/* make copy */
- ins v19.2d[0], v9.2d[1]
+ ins v19.d[0], v9.d[1]
mov v18.16b, v9.16b
trn1 v9.8b, v9.8b, v19.8b
trn2 v19.8b, v18.8b, v19.8b
@@ -1018,7 +1019,7 @@
add TMP2, TMP2, OUTPUT_COL
st1 {v9.8b}, [TMP1]
/* make copy */
- ins v7.2d[0], v10.2d[1]
+ ins v7.d[0], v10.d[1]
mov v18.16b, v10.16b
trn1 v10.8b, v10.8b, v7.8b
trn2 v7.8b, v18.8b, v7.8b
@@ -1031,7 +1032,7 @@
add TMP5, TMP5, OUTPUT_COL
st1 {v10.8b}, [TMP1]
/* make copy */
- ins v16.2d[0], v11.2d[1]
+ ins v16.d[0], v11.d[1]
mov v18.16b, v11.16b
trn1 v11.8b, v11.8b, v16.8b
trn2 v16.8b, v18.8b, v16.8b
@@ -1040,11 +1041,11 @@
st1 {v16.8b}, [TMP5]
sub sp, sp, #176
ldp x22, x23, [sp], 16
- ld1 {v0.8b - v3.8b}, [sp], 32
- ld1 {v4.8b - v7.8b}, [sp], 32
- ld1 {v8.8b - v11.8b}, [sp], 32
- ld1 {v12.8b - v15.8b}, [sp], 32
- ld1 {v16.8b - v19.8b}, [sp], 32
+ ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1095,38 +1096,38 @@
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
.balign 16
-jsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* v0.4h[0] */
- .short -FIX_0_765366865 /* v0.4h[1] */
- .short -FIX_0_211164243 /* v0.4h[2] */
- .short FIX_1_451774981 /* v0.4h[3] */
+Ljsimd_idct_4x4_neon_consts:
+ .short FIX_1_847759065 /* v0.h[0] */
+ .short -FIX_0_765366865 /* v0.h[1] */
+ .short -FIX_0_211164243 /* v0.h[2] */
+ .short FIX_1_451774981 /* v0.h[3] */
.short -FIX_2_172734803 /* d1[0] */
.short FIX_1_061594337 /* d1[1] */
.short -FIX_0_509795579 /* d1[2] */
.short -FIX_0_601344887 /* d1[3] */
- .short FIX_0_899976223 /* v2.4h[0] */
- .short FIX_2_562915447 /* v2.4h[1] */
- .short 1 << (CONST_BITS+1) /* v2.4h[2] */
- .short 0 /* v2.4h[3] */
+ .short FIX_0_899976223 /* v2.h[0] */
+ .short FIX_2_562915447 /* v2.h[1] */
+ .short 1 << (CONST_BITS+1) /* v2.h[2] */
+ .short 0 /* v2.h[3] */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- smull v28.4s, \x4, v2.4h[2]
- smlal v28.4s, \x8, v0.4h[0]
- smlal v28.4s, \x14, v0.4h[1]
+ smull v28.4s, \x4, v2.h[2]
+ smlal v28.4s, \x8, v0.h[0]
+ smlal v28.4s, \x14, v0.h[1]
- smull v26.4s, \x16, v1.4h[2]
- smlal v26.4s, \x12, v1.4h[3]
- smlal v26.4s, \x10, v2.4h[0]
- smlal v26.4s, \x6, v2.4h[1]
+ smull v26.4s, \x16, v1.h[2]
+ smlal v26.4s, \x12, v1.h[3]
+ smlal v26.4s, \x10, v2.h[0]
+ smlal v26.4s, \x6, v2.h[1]
- smull v30.4s, \x4, v2.4h[2]
- smlsl v30.4s, \x8, v0.4h[0]
- smlsl v30.4s, \x14, v0.4h[1]
+ smull v30.4s, \x4, v2.h[2]
+ smlsl v30.4s, \x8, v0.h[0]
+ smlsl v30.4s, \x14, v0.h[1]
- smull v24.4s, \x16, v0.4h[2]
- smlal v24.4s, \x12, v0.4h[3]
- smlal v24.4s, \x10, v1.4h[0]
- smlal v24.4s, \x6, v1.4h[1]
+ smull v24.4s, \x16, v0.h[2]
+ smlal v24.4s, \x12, v0.h[3]
+ smlal v24.4s, \x10, v1.h[0]
+ smlal v24.4s, \x6, v1.h[1]
add v20.4s, v28.4s, v26.4s
sub v28.4s, v28.4s, v26.4s
@@ -1171,15 +1172,15 @@
sub sp, sp, 272
str x15, [sp], 16
/* Load constants (v3.4h is just used for padding) */
- adr TMP4, jsimd_idct_4x4_neon_consts
- st1 {v0.8b - v3.8b}, [sp], 32
- st1 {v4.8b - v7.8b}, [sp], 32
- st1 {v8.8b - v11.8b}, [sp], 32
- st1 {v12.8b - v15.8b}, [sp], 32
- st1 {v16.8b - v19.8b}, [sp], 32
- st1 {v20.8b - v23.8b}, [sp], 32
- st1 {v24.8b - v27.8b}, [sp], 32
- st1 {v28.8b - v31.8b}, [sp], 32
+ adr TMP4, Ljsimd_idct_4x4_neon_consts
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+ st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1203,45 +1204,45 @@
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
mul v4.4h, v4.4h, v18.4h
mul v5.4h, v5.4h, v19.4h
- ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
+ ins v4.d[1], v5.d[0] /* 128 bit q4 */
ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
mul v6.4h, v6.4h, v20.4h
mul v7.4h, v7.4h, v21.4h
- ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
+ ins v6.d[1], v7.d[0] /* 128 bit q6 */
mul v8.4h, v8.4h, v22.4h
mul v9.4h, v9.4h, v23.4h
- ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
+ ins v8.d[1], v9.d[0] /* 128 bit q8 */
add DCT_TABLE, DCT_TABLE, #16
ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
mul v10.4h, v10.4h, v24.4h
mul v11.4h, v11.4h, v25.4h
- ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
+ ins v10.d[1], v11.d[0] /* 128 bit q10 */
mul v12.4h, v12.4h, v26.4h
mul v13.4h, v13.4h, v27.4h
- ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
+ ins v12.d[1], v13.d[0] /* 128 bit q12 */
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
mul v14.4h, v14.4h, v28.4h
mul v15.4h, v15.4h, v29.4h
- ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
+ ins v14.d[1], v15.d[0] /* 128 bit q14 */
mul v16.4h, v16.4h, v30.4h
mul v17.4h, v17.4h, v31.4h
- ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
+ ins v16.d[1], v17.d[0] /* 128 bit q16 */
/* Pass 1 */
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
transpose_4x4 v4, v6, v8, v10, v3
- ins v10.2d[1], v11.2d[0]
+ ins v10.d[1], v11.d[0]
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
transpose_4x4 v5, v7, v9, v11, v3
- ins v10.2d[1], v11.2d[0]
+ ins v10.d[1], v11.d[0]
/* Pass 2 */
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
transpose_4x4 v26, v27, v28, v29, v3
/* Range limit */
movi v30.8h, #0x80
- ins v26.2d[1], v27.2d[0]
- ins v28.2d[1], v29.2d[0]
+ ins v26.d[1], v27.d[0]
+ ins v28.d[1], v29.d[0]
add v26.8h, v26.8h, v30.8h
add v28.8h, v28.8h, v30.8h
sqxtun v26.8b, v26.8h
@@ -1286,14 +1287,14 @@
/* vpop {v8.4h - v15.4h} ;not available */
sub sp, sp, #272
ldr x15, [sp], 16
- ld1 {v0.8b - v3.8b}, [sp], 32
- ld1 {v4.8b - v7.8b}, [sp], 32
- ld1 {v8.8b - v11.8b}, [sp], 32
- ld1 {v12.8b - v15.8b}, [sp], 32
- ld1 {v16.8b - v19.8b}, [sp], 32
- ld1 {v20.8b - v23.8b}, [sp], 32
- ld1 {v24.8b - v27.8b}, [sp], 32
- ld1 {v28.8b - v31.8b}, [sp], 32
+ ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+ ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1325,7 +1326,7 @@
*/
.balign 8
-jsimd_idct_2x2_neon_consts:
+Ljsimd_idct_2x2_neon_consts:
.short -FIX_0_720959822 /* v14[0] */
.short FIX_0_850430095 /* v14[1] */
.short -FIX_1_272758580 /* v14[2] */
@@ -1333,10 +1334,10 @@
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
sshll v15.4s, \x4, #15
- smull v26.4s, \x6, v14.4h[3]
- smlal v26.4s, \x10, v14.4h[2]
- smlal v26.4s, \x12, v14.4h[1]
- smlal v26.4s, \x16, v14.4h[0]
+ smull v26.4s, \x6, v14.h[3]
+ smlal v26.4s, \x10, v14.h[2]
+ smlal v26.4s, \x12, v14.h[1]
+ smlal v26.4s, \x16, v14.h[0]
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
@@ -1367,14 +1368,14 @@
str x15, [sp], 16
/* Load constants */
- adr TMP2, jsimd_idct_2x2_neon_consts
- st1 {v4.8b - v7.8b}, [sp], 32
- st1 {v8.8b - v11.8b}, [sp], 32
- st1 {v12.8b - v15.8b}, [sp], 32
- st1 {v16.8b - v19.8b}, [sp], 32
- st1 {v21.8b - v22.8b}, [sp], 16
- st1 {v24.8b - v27.8b}, [sp], 32
- st1 {v30.8b - v31.8b}, [sp], 16
+ adr TMP2, Ljsimd_idct_2x2_neon_consts
+ st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ st1 {v21.8b, v22.8b}, [sp], 16
+ st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ st1 {v30.8b, v31.8b}, [sp], 16
ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1400,25 +1401,25 @@
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
mul v4.4h, v4.4h, v18.4h
mul v5.4h, v5.4h, v19.4h
- ins v4.2d[1], v5.2d[0]
+ ins v4.d[1], v5.d[0]
mul v6.4h, v6.4h, v20.4h
mul v7.4h, v7.4h, v21.4h
- ins v6.2d[1], v7.2d[0]
+ ins v6.d[1], v7.d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
mul v10.4h, v10.4h, v24.4h
mul v11.4h, v11.4h, v25.4h
- ins v10.2d[1], v11.2d[0]
+ ins v10.d[1], v11.d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
mul v12.4h, v12.4h, v26.4h
mul v13.4h, v13.4h, v27.4h
- ins v12.2d[1], v13.2d[0]
+ ins v12.d[1], v13.d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
mul v16.4h, v16.4h, v30.4h
mul v17.4h, v17.4h, v31.4h
- ins v16.2d[1], v17.2d[0]
+ ins v16.d[1], v17.d[0]
/* Pass 1 */
#if 0
@@ -1427,14 +1428,14 @@
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
- smull v26.4s, v6.4h, v14.4h[3]
- smlal v26.4s, v10.4h, v14.4h[2]
- smlal v26.4s, v12.4h, v14.4h[1]
- smlal v26.4s, v16.4h, v14.4h[0]
- smull v24.4s, v7.4h, v14.4h[3]
- smlal v24.4s, v11.4h, v14.4h[2]
- smlal v24.4s, v13.4h, v14.4h[1]
- smlal v24.4s, v17.4h, v14.4h[0]
+ smull v26.4s, v6.4h, v14.h[3]
+ smlal v26.4s, v10.4h, v14.h[2]
+ smlal v26.4s, v12.4h, v14.h[1]
+ smlal v26.4s, v16.4h, v14.h[0]
+ smull v24.4s, v7.4h, v14.h[3]
+ smlal v24.4s, v11.4h, v14.h[2]
+ smlal v24.4s, v13.4h, v14.h[1]
+ smlal v24.4s, v17.4h, v14.h[0]
sshll v15.4s, v4.4h, #15
sshll v30.4s, v5.4h, #15
add v20.4s, v15.4s, v26.4s
@@ -1445,12 +1446,12 @@
sub v15.4s, v30.4s, v24.4s
rshrn v5.4h, v20.4s, #13
rshrn v7.4h, v15.4s, #13
- ins v4.2d[1], v5.2d[0]
- ins v6.2d[1], v7.2d[0]
+ ins v4.d[1], v5.d[0]
+ ins v6.d[1], v7.d[0]
transpose v4, v6, v3, .16b, .8h
transpose v6, v10, v3, .16b, .4s
- ins v11.2d[0], v10.2d[1]
- ins v7.2d[0], v6.2d[1]
+ ins v11.d[0], v10.d[1]
+ ins v7.d[0], v6.d[1]
#endif
/* Pass 2 */
@@ -1458,10 +1459,10 @@
/* Range limit */
movi v30.8h, #0x80
- ins v26.2d[1], v27.2d[0]
+ ins v26.d[1], v27.d[0]
add v26.8h, v26.8h, v30.8h
sqxtun v30.8b, v26.8h
- ins v26.2d[0], v30.2d[0]
+ ins v26.d[0], v30.d[0]
sqxtun v27.8b, v26.8h
/* Store results to the output buffer */
@@ -1476,13 +1477,13 @@
sub sp, sp, #208
ldr x15, [sp], 16
- ld1 {v4.8b - v7.8b}, [sp], 32
- ld1 {v8.8b - v11.8b}, [sp], 32
- ld1 {v12.8b - v15.8b}, [sp], 32
- ld1 {v16.8b - v19.8b}, [sp], 32
- ld1 {v21.8b - v22.8b}, [sp], 16
- ld1 {v24.8b - v27.8b}, [sp], 32
- ld1 {v30.8b - v31.8b}, [sp], 16
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ ld1 {v21.8b, v22.8b}, [sp], 16
+ ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ ld1 {v30.8b, v31.8b}, [sp], 16
blr x30
.unreq DCT_TABLE
@@ -1514,9 +1515,9 @@
ld1 {v4.8b}, [U], 8
ld1 {v5.8b}, [V], 8
ld1 {v0.8b}, [Y], 8
- prfm PLDL1KEEP, [U, #64]
- prfm PLDL1KEEP, [V, #64]
- prfm PLDL1KEEP, [Y, #64]
+ prfm pldl1keep, [U, #64]
+ prfm pldl1keep, [V, #64]
+ prfm pldl1keep, [Y, #64]
.elseif \size == 4
ld1 {v4.b}[0], [U], 1
ld1 {v4.b}[1], [U], 1
@@ -1606,14 +1607,14 @@
.macro do_yuv_to_rgb_stage1
uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
- smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
- smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
- smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
- smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+ smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
+ smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
+ smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
+ smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
+ smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
+ smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
+ smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
+ smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
.endm
.macro do_yuv_to_rgb_stage2
@@ -1656,18 +1657,18 @@
sqxtun v1\g_offs\defsize, v20.8h
ld1 {v0.8b}, [Y], 8
sqxtun v1\r_offs\defsize, v24.8h
- prfm PLDL1KEEP, [U, #64]
- prfm PLDL1KEEP, [V, #64]
- prfm PLDL1KEEP, [Y, #64]
+ prfm pldl1keep, [U, #64]
+ prfm pldl1keep, [V, #64]
+ prfm pldl1keep, [Y, #64]
sqxtun v1\b_offs\defsize, v28.8h
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
- smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
- smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
+ smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
+ smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
+ smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
+ smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
+ smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
+ smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
.else /**************************** rgb565 ***********************************/
sqshlu v21.8h, v20.8h, #8
sqshlu v25.8h, v24.8h, #8
@@ -1675,21 +1676,21 @@
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
ld1 {v0.8b}, [Y], 8
- smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
+ smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
+ smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
+ smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
+ smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
sri v25.8h, v21.8h, #5
- smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
- prfm PLDL1KEEP, [U, #64]
- prfm PLDL1KEEP, [V, #64]
- prfm PLDL1KEEP, [Y, #64]
+ smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
+ smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
+ prfm pldl1keep, [U, #64]
+ prfm pldl1keep, [V, #64]
+ prfm pldl1keep, [Y, #64]
sri v25.8h, v29.8h, #11
.endif
do_store \bpp, 8
- smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
- smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+ smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
+ smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
.endm
.macro do_yuv_to_rgb
@@ -1702,7 +1703,7 @@
*/
.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
+Ljsimd_ycc_\colorid\()_neon_consts:
.short 0, 0, 0, 0
.short 22971, -11277, -23401, 29033
.short -128, -128, -128, -128
@@ -1717,7 +1718,7 @@
INPUT_BUF0 .req x5
INPUT_BUF1 .req x6
- INPUT_BUF2 .req INPUT_BUF
+ INPUT_BUF2 .req x1
RGB .req x7
Y .req x8
@@ -1728,16 +1729,16 @@
sub sp, sp, 336
str x15, [sp], 16
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
- adr x15, jsimd_ycc_\colorid\()_neon_consts
+ adr x15, Ljsimd_ycc_\colorid\()_neon_consts
/* Save NEON registers */
- st1 {v0.8b - v3.8b}, [sp], 32
- st1 {v4.8b - v7.8b}, [sp], 32
- st1 {v8.8b - v11.8b}, [sp], 32
- st1 {v12.8b - v15.8b}, [sp], 32
- st1 {v16.8b - v19.8b}, [sp], 32
- st1 {v20.8b - v23.8b}, [sp], 32
- st1 {v24.8b - v27.8b}, [sp], 32
- st1 {v28.8b - v31.8b}, [sp], 32
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+ st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15]
@@ -1748,8 +1749,8 @@
stp x8, x9, [sp], 16
stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, 8]
- ldr INPUT_BUF2, [INPUT_BUF, 16]
+ ldr INPUT_BUF1, [INPUT_BUF, #8]
+ ldr INPUT_BUF2, [INPUT_BUF, #16]
.unreq INPUT_BUF
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
@@ -1758,7 +1759,7 @@
/* Outer loop over scanlines */
cmp NUM_ROWS, #1
- blt 9f
+ b.lt 9f
0:
lsl x16, INPUT_ROW, #3
ldr Y, [INPUT_BUF0, x16]
@@ -1770,60 +1771,60 @@
/* Inner loop over pixels */
subs N, N, #8
- blt 3f
+ b.lt 3f
do_load 8
do_yuv_to_rgb_stage1
subs N, N, #8
- blt 2f
+ b.lt 2f
1:
do_yuv_to_rgb_stage2_store_load_stage1
subs N, N, #8
- bge 1b
+ b.ge 1b
2:
do_yuv_to_rgb_stage2
do_store \bpp, 8
tst N, #7
- beq 8f
+ b.eq 8f
3:
tst N, #4
- beq 3f
+ b.eq 3f
do_load 4
3:
tst N, #2
- beq 4f
+ b.eq 4f
do_load 2
4:
tst N, #1
- beq 5f
+ b.eq 5f
do_load 1
5:
do_yuv_to_rgb
tst N, #4
- beq 6f
+ b.eq 6f
do_store \bpp, 4
6:
tst N, #2
- beq 7f
+ b.eq 7f
do_store \bpp, 2
7:
tst N, #1
- beq 8f
+ b.eq 8f
do_store \bpp, 1
8:
subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
+ b.gt 0b
9:
/* Restore all registers and return */
sub sp, sp, #336
ldr x15, [sp], 16
- ld1 {v0.8b - v3.8b}, [sp], 32
- ld1 {v4.8b - v7.8b}, [sp], 32
- ld1 {v8.8b - v11.8b}, [sp], 32
- ld1 {v12.8b - v15.8b}, [sp], 32
- ld1 {v16.8b - v19.8b}, [sp], 32
- ld1 {v20.8b - v23.8b}, [sp], 32
- ld1 {v24.8b - v27.8b}, [sp], 32
- ld1 {v28.8b - v31.8b}, [sp], 32
+ ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+ ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+ ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+ ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
ldp x4, x5, [sp], 16
ldp x6, x7, [sp], 16