Implement aarch64 neon instruction set AdvSIMD (3V elem).
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@191945 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index b59843a..0b5508a 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -40,16 +40,25 @@
def OP_MLAL_N : Op;
def OP_MLSL_N : Op;
def OP_MUL_LN: Op;
+def OP_MULX_LN: Op;
def OP_MULL_LN : Op;
+def OP_MULLHi_LN : Op;
def OP_MLA_LN: Op;
def OP_MLS_LN: Op;
def OP_MLAL_LN : Op;
+def OP_MLALHi_LN : Op;
def OP_MLSL_LN : Op;
+def OP_MLSLHi_LN : Op;
def OP_QDMULL_LN : Op;
+def OP_QDMULLHi_LN : Op;
def OP_QDMLAL_LN : Op;
+def OP_QDMLALHi_LN : Op;
def OP_QDMLSL_LN : Op;
+def OP_QDMLSLHi_LN : Op;
def OP_QDMULH_LN : Op;
def OP_QRDMULH_LN : Op;
+def OP_FMS_LN : Op;
+def OP_FMS_LNQ : Op;
def OP_EQ : Op;
def OP_GE : Op;
def OP_LE : Op;
@@ -146,6 +155,7 @@
// f: float (int args)
// d: default
// g: default, ignore 'Q' size modifier.
+// j: default, force 'Q' size modifier.
// w: double width elements, same num elts
// n: double width elements, half num elts
// h: half width elements, double num elts
@@ -503,7 +513,7 @@
////////////////////////////////////////////////////////////////////////////////
// Multiplication Extended
-def MULX : SInst<"vmulx", "ddd", "fQfQd">;
+def MULX : SInst<"vmulx", "ddd", "fdQfQd">;
////////////////////////////////////////////////////////////////////////////////
// Division
@@ -630,6 +640,63 @@
def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>;
////////////////////////////////////////////////////////////////////////////////
+
+def VMLA_LANEQ : IOpInst<"vmla_laneq", "dddji",
+ "siUsUifQsQiQUsQUiQf", OP_MLA_LN>;
+def VMLS_LANEQ : IOpInst<"vmls_laneq", "dddji",
+ "siUsUifQsQiQUsQUiQf", OP_MLS_LN>;
+
+def VFMA_LANE : IInst<"vfma_lane", "dddgi", "fdQfQd">;
+def VFMA_LANEQ : IInst<"vfma_laneq", "dddji", "fdQfQd">;
+def VFMS_LANE : IOpInst<"vfms_lane", "dddgi", "fdQfQd", OP_FMS_LN>;
+def VFMS_LANEQ : IOpInst<"vfms_laneq", "dddji", "fdQfQd", OP_FMS_LNQ>;
+
+def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "wwdki", "siUsUi", OP_MLAL_LN>;
+def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "wwkdi", "siUsUi",
+ OP_MLALHi_LN>;
+def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "wwkki", "siUsUi",
+ OP_MLALHi_LN>;
+def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "wwdki", "siUsUi", OP_MLSL_LN>;
+def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "wwkdi", "siUsUi",
+ OP_MLSLHi_LN>;
+def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "wwkki", "siUsUi",
+ OP_MLSLHi_LN>;
+
+def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "wwdki", "si", OP_QDMLAL_LN>;
+def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "wwkdi", "si",
+ OP_QDMLALHi_LN>;
+def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "wwkki", "si",
+ OP_QDMLALHi_LN>;
+def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "wwdki", "si", OP_QDMLSL_LN>;
+def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "wwkdi", "si",
+ OP_QDMLSLHi_LN>;
+def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si",
+ OP_QDMLSLHi_LN>;
+
+// Newly add double parameter for vmul_lane in aarch64
+def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "dQd", OP_MUL_LN>;
+
+def VMUL_LANEQ : IOpInst<"vmul_laneq", "ddji",
+ "sifdUsUiQsQiQfQUsQUiQfQd", OP_MUL_LN>;
+def VMULL_LANEQ : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LN>;
+def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "wkdi", "siUsUi",
+ OP_MULLHi_LN>;
+def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "wkki", "siUsUi",
+ OP_MULLHi_LN>;
+
+def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "wdki", "si", OP_QDMULL_LN>;
+def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "wkdi", "si",
+ OP_QDMULLHi_LN>;
+def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "wkki", "si",
+ OP_QDMULLHi_LN>;
+
+def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LN>;
+def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LN>;
+
+def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fdQfQd", OP_MULX_LN>;
+def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fdQfQd", OP_MULX_LN>;
+
+////////////////////////////////////////////////////////////////////////////////
// Scalar Arithmetic
// Scalar Addition
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index a6911ba..b4caab2 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -2222,6 +2222,46 @@
}
// AArch64-only builtins
+ case AArch64::BI__builtin_neon_vfma_lane_v:
+ case AArch64::BI__builtin_neon_vfmaq_laneq_v: {
+ Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+ Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+ Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+ Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+ Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
+ return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+ }
+ case AArch64::BI__builtin_neon_vfmaq_lane_v: {
+ Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+ Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+ Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+ llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
+ llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
+ VTy->getNumElements() / 2);
+ Ops[2] = Builder.CreateBitCast(Ops[2], STy);
+ Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
+ cast<ConstantInt>(Ops[3]));
+ Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
+
+ return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+ }
+ case AArch64::BI__builtin_neon_vfma_laneq_v: {
+ Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+ Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+ Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+ llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
+ llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
+ VTy->getNumElements() * 2);
+ Ops[2] = Builder.CreateBitCast(Ops[2], STy);
+ Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
+ cast<ConstantInt>(Ops[3]));
+ Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
+
+ return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+ }
case AArch64::BI__builtin_neon_vfms_v:
case AArch64::BI__builtin_neon_vfmsq_v: {
Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c
new file mode 100644
index 0000000..f34e11a
--- /dev/null
+++ b/test/CodeGen/aarch64-neon-2velem.c
@@ -0,0 +1,802 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
+// RUN: -S -O3 -o - %s | FileCheck %s
+
+// Test new aarch64 intrinsics and types
+
+#include <arm_neon.h>
+
+int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
+ // CHECK: test_vmla_lane_s16
+ return vmla_lane_s16(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
+ // CHECK: test_vmlaq_lane_s16
+ return vmlaq_lane_s16(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
+ // CHECK: test_vmla_lane_s32
+ return vmla_lane_s32(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
+ // CHECK: test_vmlaq_lane_s32
+ return vmlaq_lane_s32(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
+ // CHECK: test_vmla_laneq_s16
+ return vmla_laneq_s16(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
+ // CHECK: test_vmlaq_laneq_s16
+ return vmlaq_laneq_s16(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
+ // CHECK: test_vmla_laneq_s32
+ return vmla_laneq_s32(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
+ // CHECK: test_vmlaq_laneq_s32
+ return vmlaq_laneq_s32(a, b, v, 1);
+ // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
+ // CHECK: test_vmls_lane_s16
+ return vmls_lane_s16(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
+ // CHECK: test_vmlsq_lane_s16
+ return vmlsq_lane_s16(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
+ // CHECK: test_vmls_lane_s32
+ return vmls_lane_s32(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
+ // CHECK: test_vmlsq_lane_s32
+ return vmlsq_lane_s32(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
+ // CHECK: test_vmls_laneq_s16
+ return vmls_laneq_s16(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
+ // CHECK: test_vmlsq_laneq_s16
+ return vmlsq_laneq_s16(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
+ // CHECK: test_vmls_laneq_s32
+ return vmls_laneq_s32(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
+ // CHECK: test_vmlsq_laneq_s32
+ return vmlsq_laneq_s32(a, b, v, 1);
+ // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
+ // CHECK: test_vmul_lane_s16
+ return vmul_lane_s16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
+ // CHECK: test_vmulq_lane_s16
+ return vmulq_lane_s16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
+ // CHECK: test_vmul_lane_s32
+ return vmul_lane_s32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
+ // CHECK: test_vmulq_lane_s32
+ return vmulq_lane_s32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
+ // CHECK: test_vmul_lane_u16
+ return vmul_lane_u16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
+ // CHECK: test_vmulq_lane_u16
+ return vmulq_lane_u16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
+ // CHECK: test_vmul_lane_u32
+ return vmul_lane_u32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
+ // CHECK: test_vmulq_lane_u32
+ return vmulq_lane_u32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
+ // CHECK: test_vmul_laneq_s16
+ return vmul_laneq_s16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
+ // CHECK: test_vmulq_laneq_s16
+ return vmulq_laneq_s16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
+ // CHECK: test_vmul_laneq_s32
+ return vmul_laneq_s32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
+ // CHECK: test_vmulq_laneq_s32
+ return vmulq_laneq_s32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
+ // CHECK: test_vmul_laneq_u16
+ return vmul_laneq_u16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
+ // CHECK: test_vmulq_laneq_u16
+ return vmulq_laneq_u16(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
+ // CHECK: test_vmul_laneq_u32
+ return vmul_laneq_u32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
+ // CHECK: test_vmulq_laneq_u32
+ return vmulq_laneq_u32(a, v, 1);
+ // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+ // CHECK: test_vfma_lane_f32
+ return vfma_lane_f32(a, b, v, 1);
+ // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+ // CHECK: test_vfmaq_lane_f32
+ return vfmaq_lane_f32(a, b, v, 1);
+ // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+ // CHECK: test_vfma_laneq_f32
+ return vfma_laneq_f32(a, b, v, 1);
+ // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+ // CHECK: test_vfmaq_laneq_f32
+ return vfmaq_laneq_f32(a, b, v, 1);
+ // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+ // CHECK: test_vfms_lane_f32
+ return vfms_lane_f32(a, b, v, 1);
+ // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+ // CHECK: test_vfmsq_lane_f32
+ return vfmsq_lane_f32(a, b, v, 1);
+ // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+ // CHECK: test_vfms_laneq_f32
+ return vfms_laneq_f32(a, b, v, 1);
+ // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+ // CHECK: test_vfmsq_laneq_f32
+ return vfmsq_laneq_f32(a, b, v, 1);
+ // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+ // CHECK: test_vfmaq_lane_f64
+ return vfmaq_lane_f64(a, b, v, 0);
+ // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CHECK: test_vfmaq_laneq_f64_0
+ return vfmaq_laneq_f64(a, b, v, 0);
+ // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+ // CHECK: test_vfmaq_laneq_f64
+ return vfmaq_laneq_f64(a, b, v, 1);
+ // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
+float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+ // CHECK: test_vfmsq_lane_f64
+ return vfmsq_lane_f64(a, b, v, 0);
+ // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CHECK: test_vfmsq_laneq_f64_0
+ return vfmsq_laneq_f64(a, b, v, 0);
+ // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+ // CHECK: test_vfmsq_laneq_f64
+ return vfmsq_laneq_f64(a, b, v, 1);
+ // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
+int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+ // CHECK: test_vmlal_lane_s16
+ return vmlal_lane_s16(a, b, v, 1);
+ // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+ // CHECK: test_vmlal_lane_s32
+ return vmlal_lane_s32(a, b, v, 1);
+ // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
+ // CHECK: test_vmlal_laneq_s16
+ return vmlal_laneq_s16(a, b, v, 1);
+ // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
+ // CHECK: test_vmlal_laneq_s32
+ return vmlal_laneq_s32(a, b, v, 1);
+ // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+ // CHECK: test_vmlal_high_lane_s16
+ return vmlal_high_lane_s16(a, b, v, 1);
+ // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+ // CHECK: test_vmlal_high_lane_s32
+ return vmlal_high_lane_s32(a, b, v, 1);
+ // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
+ // CHECK: test_vmlal_high_laneq_s16
+ return vmlal_high_laneq_s16(a, b, v, 1);
+ // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
+ // CHECK: test_vmlal_high_laneq_s32
+ return vmlal_high_laneq_s32(a, b, v, 1);
+ // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+ // CHECK: test_vmlsl_lane_s16
+ return vmlsl_lane_s16(a, b, v, 1);
+ // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+ // CHECK: test_vmlsl_lane_s32
+ return vmlsl_lane_s32(a, b, v, 1);
+ // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
+ // CHECK: test_vmlsl_laneq_s16
+ return vmlsl_laneq_s16(a, b, v, 1);
+ // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
+ // CHECK: test_vmlsl_laneq_s32
+ return vmlsl_laneq_s32(a, b, v, 1);
+ // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+ // CHECK: test_vmlsl_high_lane_s16
+ return vmlsl_high_lane_s16(a, b, v, 1);
+ // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+ // CHECK: test_vmlsl_high_lane_s32
+ return vmlsl_high_lane_s32(a, b, v, 1);
+ // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
+ // CHECK: test_vmlsl_high_laneq_s16
+ return vmlsl_high_laneq_s16(a, b, v, 1);
+ // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
+ // CHECK: test_vmlsl_high_laneq_s32
+ return vmlsl_high_laneq_s32(a, b, v, 1);
+ // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+// Unsigned widening multiply-accumulate by element. Per ACLE, the u16/u32
+// variants take and return unsigned vector types; using the signed types here
+// would only pass via implicit conversion and would not exercise the real
+// intrinsic signatures.
+uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v) {
+  // CHECK: test_vmlal_lane_u16
+  return vmlal_lane_u16(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v) {
+  // CHECK: test_vmlal_lane_u32
+  return vmlal_lane_u32(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmlal_laneq_u16(uint32x4_t a, uint16x4_t b, uint16x8_t v) {
+  // CHECK: test_vmlal_laneq_u16
+  return vmlal_laneq_u16(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlal_laneq_u32(uint64x2_t a, uint32x2_t b, uint32x4_t v) {
+  // CHECK: test_vmlal_laneq_u32
+  return vmlal_laneq_u32(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmlal_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v) {
+  // CHECK: test_vmlal_high_lane_u16
+  return vmlal_high_lane_u16(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlal_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v) {
+  // CHECK: test_vmlal_high_lane_u32
+  return vmlal_high_lane_u32(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmlal_high_laneq_u16(uint32x4_t a, uint16x8_t b, uint16x8_t v) {
+  // CHECK: test_vmlal_high_laneq_u16
+  return vmlal_high_laneq_u16(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlal_high_laneq_u32(uint64x2_t a, uint32x4_t b, uint32x4_t v) {
+  // CHECK: test_vmlal_high_laneq_u32
+  return vmlal_high_laneq_u32(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+// Unsigned widening multiply-subtract by element. Per ACLE, the u16/u32
+// variants take and return unsigned vector types; the signed types previously
+// used here relied on implicit conversion and did not match the intrinsic
+// prototypes being tested.
+uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v) {
+  // CHECK: test_vmlsl_lane_u16
+  return vmlsl_lane_u16(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v) {
+  // CHECK: test_vmlsl_lane_u32
+  return vmlsl_lane_u32(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmlsl_laneq_u16(uint32x4_t a, uint16x4_t b, uint16x8_t v) {
+  // CHECK: test_vmlsl_laneq_u16
+  return vmlsl_laneq_u16(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlsl_laneq_u32(uint64x2_t a, uint32x2_t b, uint32x4_t v) {
+  // CHECK: test_vmlsl_laneq_u32
+  return vmlsl_laneq_u32(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmlsl_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v) {
+  // CHECK: test_vmlsl_high_lane_u16
+  return vmlsl_high_lane_u16(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlsl_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v) {
+  // CHECK: test_vmlsl_high_lane_u32
+  return vmlsl_high_lane_u32(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmlsl_high_laneq_u16(uint32x4_t a, uint16x8_t b, uint16x8_t v) {
+  // CHECK: test_vmlsl_high_laneq_u16
+  return vmlsl_high_laneq_u16(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmlsl_high_laneq_u32(uint64x2_t a, uint32x4_t b, uint32x4_t v) {
+  // CHECK: test_vmlsl_high_laneq_u32
+  return vmlsl_high_laneq_u32(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
+ // CHECK: test_vmull_lane_s16
+ return vmull_lane_s16(a, v, 1);
+ // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
+ // CHECK: test_vmull_lane_s32
+ return vmull_lane_s32(a, v, 1);
+ // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
+ // CHECK: test_vmull_lane_u16
+ return vmull_lane_u16(a, v, 1);
+ // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
+ // CHECK: test_vmull_lane_u32
+ return vmull_lane_u32(a, v, 1);
+ // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
+ // CHECK: test_vmull_high_lane_s16
+ return vmull_high_lane_s16(a, v, 1);
+ // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
+ // CHECK: test_vmull_high_lane_s32
+ return vmull_high_lane_s32(a, v, 1);
+ // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
+ // CHECK: test_vmull_high_lane_u16
+ return vmull_high_lane_u16(a, v, 1);
+ // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
+ // CHECK: test_vmull_high_lane_u32
+ return vmull_high_lane_u32(a, v, 1);
+ // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
+ // CHECK: test_vmull_laneq_s16
+ return vmull_laneq_s16(a, v, 1);
+ // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
+ // CHECK: test_vmull_laneq_s32
+ return vmull_laneq_s32(a, v, 1);
+ // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
+ // CHECK: test_vmull_laneq_u16
+ return vmull_laneq_u16(a, v, 1);
+ // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
+ // CHECK: test_vmull_laneq_u32
+ return vmull_laneq_u32(a, v, 1);
+ // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
+ // CHECK: test_vmull_high_laneq_s16
+ return vmull_high_laneq_s16(a, v, 1);
+ // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
+ // CHECK: test_vmull_high_laneq_s32
+ return vmull_high_laneq_s32(a, v, 1);
+ // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
+ // CHECK: test_vmull_high_laneq_u16
+ return vmull_high_laneq_u16(a, v, 1);
+ // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
+ // CHECK: test_vmull_high_laneq_u32
+ return vmull_high_laneq_u32(a, v, 1);
+ // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+ // CHECK: test_vqdmlal_lane_s16
+ return vqdmlal_lane_s16(a, b, v, 1);
+ // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+ // CHECK: test_vqdmlal_lane_s32
+ return vqdmlal_lane_s32(a, b, v, 1);
+ // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+ // CHECK: test_vqdmlal_high_lane_s16
+ return vqdmlal_high_lane_s16(a, b, v, 1);
+ // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+ // CHECK: test_vqdmlal_high_lane_s32
+ return vqdmlal_high_lane_s32(a, b, v, 1);
+ // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+ // CHECK: test_vqdmlsl_lane_s16
+ return vqdmlsl_lane_s16(a, b, v, 1);
+ // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+ // CHECK: test_vqdmlsl_lane_s32
+ return vqdmlsl_lane_s32(a, b, v, 1);
+ // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+ // CHECK: test_vqdmlsl_high_lane_s16
+ return vqdmlsl_high_lane_s16(a, b, v, 1);
+ // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+ // CHECK: test_vqdmlsl_high_lane_s32
+ return vqdmlsl_high_lane_s32(a, b, v, 1);
+ // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
+ // CHECK: test_vqdmull_lane_s16
+ return vqdmull_lane_s16(a, v, 1);
+ // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
+ // CHECK: test_vqdmull_lane_s32
+ return vqdmull_lane_s32(a, v, 1);
+ // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
+ // CHECK: test_vqdmull_laneq_s16
+ return vqdmull_laneq_s16(a, v, 1);
+ // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
+ // CHECK: test_vqdmull_laneq_s32
+ return vqdmull_laneq_s32(a, v, 1);
+ // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
+ // CHECK: test_vqdmull_high_lane_s16
+ return vqdmull_high_lane_s16(a, v, 1);
+ // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
+ // CHECK: test_vqdmull_high_lane_s32
+ return vqdmull_high_lane_s32(a, v, 1);
+ // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
+ // CHECK: test_vqdmull_high_laneq_s16
+ return vqdmull_high_laneq_s16(a, v, 1);
+ // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
+ // CHECK: test_vqdmull_high_laneq_s32
+ return vqdmull_high_laneq_s32(a, v, 1);
+ // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
+ // CHECK: test_vqdmulh_lane_s16
+ return vqdmulh_lane_s16(a, v, 1);
+ // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
+ // CHECK: test_vqdmulhq_lane_s16
+ return vqdmulhq_lane_s16(a, v, 1);
+ // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
+ // CHECK: test_vqdmulh_lane_s32
+ return vqdmulh_lane_s32(a, v, 1);
+ // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
+ // CHECK: test_vqdmulhq_lane_s32
+ return vqdmulhq_lane_s32(a, v, 1);
+ // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
+ // CHECK: test_vqrdmulh_lane_s16
+ return vqrdmulh_lane_s16(a, v, 1);
+ // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
+ // CHECK: test_vqrdmulhq_lane_s16
+ return vqrdmulhq_lane_s16(a, v, 1);
+ // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
+ // CHECK: test_vqrdmulh_lane_s32
+ return vqrdmulh_lane_s32(a, v, 1);
+ // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
+ // CHECK: test_vqrdmulhq_lane_s32
+ return vqrdmulhq_lane_s32(a, v, 1);
+ // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
+ // CHECK: test_vmul_lane_f32
+ return vmul_lane_f32(a, v, 1);
+ // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
+ // CHECK: test_vmulq_lane_f32
+ return vmulq_lane_f32(a, v, 1);
+ // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
+ // CHECK: test_vmulq_lane_f64
+ return vmulq_lane_f64(a, v, 0);
+ // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
+ // CHECK: test_vmul_laneq_f32
+ return vmul_laneq_f32(a, v, 1);
+ // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
+ // CHECK: test_vmulq_laneq_f32
+ return vmulq_laneq_f32(a, v, 1);
+ // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
+  // CHECK: test_vmulq_laneq_f64_0
+ return vmulq_laneq_f64(a, v, 0);
+ // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
+ // CHECK: test_vmulq_laneq_f64
+ return vmulq_laneq_f64(a, v, 1);
+ // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
+float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
+ // CHECK: test_vmulx_lane_f32
+ return vmulx_lane_f32(a, v, 1);
+ // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
+ // CHECK: test_vmulxq_lane_f32
+ return vmulxq_lane_f32(a, v, 1);
+ // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
+ // CHECK: test_vmulxq_lane_f64
+ return vmulxq_lane_f64(a, v, 0);
+ // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
+ // CHECK: test_vmulx_laneq_f32
+ return vmulx_laneq_f32(a, v, 1);
+ // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
+ // CHECK: test_vmulxq_laneq_f32
+ return vmulxq_laneq_f32(a, v, 1);
+ // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
+  // CHECK: test_vmulxq_laneq_f64_0
+ return vmulxq_laneq_f64(a, v, 0);
+ // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
+ // CHECK: test_vmulxq_laneq_f64
+ return vmulxq_laneq_f64(a, v, 1);
+ // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index f700c67..9dc2d56 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -62,16 +62,25 @@
OpMlalN,
OpMlslN,
OpMulLane,
+ OpMulXLane,
OpMullLane,
+ OpMullHiLane,
OpMlaLane,
OpMlsLane,
OpMlalLane,
+ OpMlalHiLane,
OpMlslLane,
+ OpMlslHiLane,
OpQDMullLane,
+ OpQDMullHiLane,
OpQDMlalLane,
+ OpQDMlalHiLane,
OpQDMlslLane,
+ OpQDMlslHiLane,
OpQDMulhLane,
OpQRDMulhLane,
+ OpFMSLane,
+ OpFMSLaneQ,
OpEq,
OpGe,
OpLe,
@@ -197,16 +206,25 @@
OpMap["OP_MLAL_N"] = OpMlalN;
OpMap["OP_MLSL_N"] = OpMlslN;
OpMap["OP_MUL_LN"]= OpMulLane;
+ OpMap["OP_MULX_LN"]= OpMulXLane;
OpMap["OP_MULL_LN"] = OpMullLane;
+ OpMap["OP_MULLHi_LN"] = OpMullHiLane;
OpMap["OP_MLA_LN"]= OpMlaLane;
OpMap["OP_MLS_LN"]= OpMlsLane;
OpMap["OP_MLAL_LN"] = OpMlalLane;
+ OpMap["OP_MLALHi_LN"] = OpMlalHiLane;
OpMap["OP_MLSL_LN"] = OpMlslLane;
+ OpMap["OP_MLSLHi_LN"] = OpMlslHiLane;
OpMap["OP_QDMULL_LN"] = OpQDMullLane;
+ OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane;
OpMap["OP_QDMLAL_LN"] = OpQDMlalLane;
+ OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane;
OpMap["OP_QDMLSL_LN"] = OpQDMlslLane;
+ OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane;
OpMap["OP_QDMULH_LN"] = OpQDMulhLane;
OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane;
+ OpMap["OP_FMS_LN"] = OpFMSLane;
+ OpMap["OP_FMS_LNQ"] = OpFMSLaneQ;
OpMap["OP_EQ"] = OpEq;
OpMap["OP_GE"] = OpGe;
OpMap["OP_LE"] = OpLe;
@@ -447,6 +465,9 @@
case 'g':
quad = false;
break;
+ case 'j':
+ quad = true;
+ break;
case 'w':
type = Widen(type);
quad = true;
@@ -626,7 +647,8 @@
type = 's';
usgn = true;
}
- usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && scal && type != 'f');
+ usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
+ scal && type != 'f' && type != 'd');
if (scal) {
SmallString<128> s;
@@ -657,6 +679,8 @@
return "vv*"; // void result with void* first argument
if (mod == 'f' || (ck != ClassB && type == 'f'))
return quad ? "V4f" : "V2f";
+ if (ck != ClassB && type == 'd')
+ return quad ? "V2d" : "V1d";
if (ck != ClassB && type == 's')
return quad ? "V8s" : "V4s";
if (ck != ClassB && type == 'i')
@@ -677,6 +701,8 @@
if (mod == 'f' || (ck != ClassB && type == 'f'))
return quad ? "V4f" : "V2f";
+ if (ck != ClassB && type == 'd')
+ return quad ? "V2d" : "V1d";
if (ck != ClassB && type == 's')
return quad ? "V8s" : "V4s";
if (ck != ClassB && type == 'i')
@@ -974,6 +1000,7 @@
NormedProto += 'q';
break;
case 'g':
+ case 'j':
case 'h':
case 'e':
NormedProto += 'd';
@@ -1504,6 +1531,10 @@
case OpMulLane:
s += "__a * " + SplatLane(nElts, "__b", "__c") + ";";
break;
+ case OpMulXLane:
+ s += MangleName("vmulx", typestr, ClassS) + "(__a, " +
+ SplatLane(nElts, "__b", "__c") + ");";
+ break;
case OpMul:
s += "__a * __b;";
break;
@@ -1511,6 +1542,10 @@
s += MangleName("vmull", typestr, ClassS) + "(__a, " +
SplatLane(nElts, "__b", "__c") + ");";
break;
+ case OpMullHiLane:
+ s += MangleName("vmull", typestr, ClassS) + "(" +
+ GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
+ break;
case OpMlaN:
s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");";
break;
@@ -1528,6 +1563,10 @@
s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " +
SplatLane(nElts, "__c", "__d") + ");";
break;
+ case OpMlalHiLane:
+ s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" +
+ GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+ break;
case OpMlal:
s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
break;
@@ -1543,6 +1582,18 @@
case OpMlsLane:
s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");";
break;
+ case OpFMSLane:
+ s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n ";
+ s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n ";
+ s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n ";
+ s += MangleName("vfma_lane", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
+ break;
+ case OpFMSLaneQ:
+ s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n ";
+ s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n ";
+ s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n ";
+ s += MangleName("vfma_laneq", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
+ break;
case OpMls:
s += "__a - (__b * __c);";
break;
@@ -1554,6 +1605,10 @@
s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " +
SplatLane(nElts, "__c", "__d") + ");";
break;
+ case OpMlslHiLane:
+ s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" +
+ GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+ break;
case OpMlsl:
s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
break;
@@ -1564,14 +1619,26 @@
s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
SplatLane(nElts, "__b", "__c") + ");";
break;
+ case OpQDMullHiLane:
+ s += MangleName("vqdmull", typestr, ClassS) + "(" +
+ GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
+ break;
case OpQDMlalLane:
s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " +
SplatLane(nElts, "__c", "__d") + ");";
break;
+ case OpQDMlalHiLane:
+ s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " +
+ GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+ break;
case OpQDMlslLane:
s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " +
SplatLane(nElts, "__c", "__d") + ");";
break;
+ case OpQDMlslHiLane:
+ s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " +
+ GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+ break;
case OpQDMulhLane:
s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " +
SplatLane(nElts, "__b", "__c") + ");";
@@ -2072,20 +2139,28 @@
// Emit Neon vector typedefs.
std::string TypedefTypes(
- "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfQdPcQPcPsQPs");
+ "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPs");
SmallVector<StringRef, 24> TDTypeVec;
ParseTypes(0, TypedefTypes, TDTypeVec);
// Emit vector typedefs.
+ bool isA64 = false;
for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
bool dummy, quad = false, poly = false;
char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
- bool isA64 = false;
+ bool preinsert = false;
+ bool postinsert = false;
- if (type == 'd' && quad)
+ if (type == 'd') {
+ preinsert = isA64? false: true;
isA64 = true;
-
- if (isA64)
+ } else {
+ postinsert = isA64? true: false;
+ isA64 = false;
+ }
+ if (postinsert)
+ OS << "#endif\n";
+ if (preinsert)
OS << "#ifdef __aarch64__\n";
if (poly)
@@ -2101,22 +2176,28 @@
OS << TypeString('s', TDTypeVec[i]);
OS << " " << TypeString('d', TDTypeVec[i]) << ";\n";
- if (isA64)
- OS << "#endif\n";
}
OS << "\n";
// Emit struct typedefs.
+ isA64 = false;
for (unsigned vi = 2; vi != 5; ++vi) {
for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
bool dummy, quad = false, poly = false;
char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
- bool isA64 = false;
+ bool preinsert = false;
+ bool postinsert = false;
- if (type == 'd' && quad)
+ if (type == 'd') {
+ preinsert = isA64? false: true;
isA64 = true;
-
- if (isA64)
+ } else {
+ postinsert = isA64? true: false;
+ isA64 = false;
+ }
+ if (postinsert)
+ OS << "#endif\n";
+ if (preinsert)
OS << "#ifdef __aarch64__\n";
std::string ts = TypeString('d', TDTypeVec[i]);
@@ -2126,10 +2207,6 @@
OS << "[" << utostr(vi) << "]";
OS << ";\n} ";
OS << vs << ";\n";
-
- if (isA64)
- OS << "#endif\n";
-
OS << "\n";
}
}
@@ -2255,6 +2332,7 @@
case 'f':
case 'i':
return (2 << (int)quad) - 1;
+ case 'd':
case 'l':
return (1 << (int)quad) - 1;
default: