[AArch64] Add IR intrinsics for sq(r)dmulh_lane(q)
Summary:
Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h)
are represented in LLVM IR as a (by vector) sqdmulh and a vector of (repeated)
indices, like so:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
When %v's values are known, the shufflevector is optimized away and we are no
longer able to select the lane variant of sqdmulh in the backend.
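For illustration (a minimal sketch, assuming %v folds to a splat of the
arbitrary constant 42), the IR reaching instruction selection becomes:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> <i16 42, i16 42, i16 42, i16 42>)
and only the by-vector form can be selected, after first materialising the
splat constant in a register.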
This defeats a (hand-coded) optimization that packs several constants into a
single vector and uses the lane intrinsics to reduce register pressure and
trade off materialising several constants for a single vector load from the
constant pool, like so:
int16x8_t v = {2,3,4,5,6,7,8,9};
a = vqdmulh_laneq_s16(a, v, 0);
b = vqdmulh_laneq_s16(b, v, 1);
c = vqdmulh_laneq_s16(c, v, 2);
d = vqdmulh_laneq_s16(d, v, 3);
[...]
In one microbenchmark from libjpeg-turbo this accounts for a 2.5% to 4%
performance difference.
We could teach the compiler to recover the lane variants, but this would likely
require its own pass. (Alternatively, "volatile" could be used on the constants
vector, but this is a bit ugly.)
This patch instead implements the following LLVM IR intrinsics for AArch64 to
maintain the original structure through IR optimization and into instruction
selection (their signatures are sketched after the list):
- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq
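These take the lane index as an explicit i32 argument and keep the (possibly
narrower) type of the second vector, as in these declarations from the
updated tests:
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)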
These 'lane' variants need an additional register class: the second argument
must be in the lower half (V0-V15) of the 64-bit NEON register file, but only
when operating on i16 elements, where the instruction encoding has fewer bits
for the second register.
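For example, sqdmulh v0.4h, v0.4h, v1.h[3] is encodable, but
sqdmulh v0.4h, v0.4h, v16.h[3] is not, because the by-element encoding for
16-bit elements has only a 4-bit field for the second register. The new
FPR64_lo/V64_lo class below mirrors the existing FPR128_lo/V128_lo class to
express this constraint.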
Note that the existing patterns matching a shufflevector plus sqdmulh into
sqdmulh_lane (etc.) remain, so code that does not rely on the NEON intrinsics
to generate these instructions is not affected.
This patch also changes clang to emit these IR intrinsics for the corresponding
NEON intrinsics (AArch64 only).
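For example, vqdmulh_lane_s16(a, v, 3) now lowers to the single call
%r = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
instead of a shufflevector feeding @llvm.aarch64.neon.sqdmulh.v4i16, as the
updated clang tests below show.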
Reviewers: SjoerdMeijer, dmgreen, t.p.northover, rovka, rengolin, efriedma
Reviewed By: efriedma
Subscribers: kristof.beyls, hiraditya, jdoerfert, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D71469
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index a4dc21b..380a2a0 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -528,9 +528,16 @@
def VQDMULL_N : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
-def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;
+
+let ArchGuard = "!defined(__aarch64__)" in {
+def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
+}
+let ArchGuard = "defined(__aarch64__)" in {
+def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
+}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@
def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
OP_QDMULLHi_LN>;
-def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
-def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;
-
+let isLaneQ = 1 in {
+def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
+def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
+}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a48f6d2..d6640e5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4969,14 +4969,22 @@
NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+ NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
+ NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+ NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
+ NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+ NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+ NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+ NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
@@ -5754,6 +5762,24 @@
Ops.resize(2);
return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
}
+ case NEON::BI__builtin_neon_vqdmulhq_lane_v:
+ case NEON::BI__builtin_neon_vqdmulh_lane_v:
+ case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
+ case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
+ llvm::Type *Tys[2] = {
+ Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false))};
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+ }
+ case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
+ case NEON::BI__builtin_neon_vqdmulh_laneq_v:
+ case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
+ case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
+ llvm::Type *Tys[2] = {
+ Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ true))};
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+ }
case NEON::BI__builtin_neon_vqshl_n_v:
case NEON::BI__builtin_neon_vqshlq_n_v:
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 37cdb16..5ad06cf 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -1440,12 +1440,12 @@
// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 3);
@@ -1453,12 +1453,12 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 3);
@@ -1466,12 +1466,12 @@
// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 1);
@@ -1479,12 +1479,12 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 1);
@@ -1492,12 +1492,12 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 3);
@@ -1505,12 +1505,12 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 3);
@@ -1518,12 +1518,12 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 1);
@@ -1531,12 +1531,12 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 1);
@@ -3066,12 +3066,12 @@
// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 0);
@@ -3079,12 +3079,12 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 0);
@@ -3092,12 +3092,12 @@
// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 0);
@@ -3105,12 +3105,12 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 0);
@@ -3118,12 +3118,12 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 0);
@@ -3131,12 +3131,12 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 0);
@@ -3144,12 +3144,12 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 0);
@@ -3157,12 +3157,12 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 0);
@@ -4753,12 +4753,12 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 0);
@@ -4766,12 +4766,12 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 0);
@@ -4779,12 +4779,12 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 0);
@@ -4792,12 +4792,12 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 0);
@@ -4805,12 +4805,12 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 0);
@@ -4818,12 +4818,12 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 0);
@@ -4831,12 +4831,12 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 0);
@@ -4844,12 +4844,12 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 0);
@@ -5149,12 +5149,12 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 7);
@@ -5162,12 +5162,12 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 7);
@@ -5175,12 +5175,12 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 3);
@@ -5188,12 +5188,12 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 3);
@@ -5201,12 +5201,12 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 7);
@@ -5214,12 +5214,12 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 7);
@@ -5227,12 +5227,12 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 3);
@@ -5240,12 +5240,12 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 3);
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 8c550b7..6fbcfe8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -133,6 +133,10 @@
: Intrinsic<[llvm_anyvector_ty],
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
[IntrNoMem]>;
+ class AdvSIMD_2VectorArg_Lane_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
+ [IntrNoMem]>;
class AdvSIMD_3VectorArg_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +211,13 @@
// Vector Saturating Doubling Multiply High
def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
+ def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
+ def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
// Vector Saturating Rounding Doubling Multiply High
def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
+ def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
+ def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
// Vector Polynominal Multiply
def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index d91ef35..db27a53 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -360,6 +360,9 @@
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
+def UImmS1XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -7968,6 +7971,64 @@
}
}
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+ SDPatternOperator OpNodeLaneQ> {
+
+ def : Pat<(v4i16 (OpNodeLane
+ (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i16 (OpNodeLaneQ
+ (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLane
+ (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLaneQ
+ (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLane
+ (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+             (SUBREG_TO_REG (i32 0), (v2i32 V64:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLaneQ
+ (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLane
+ (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLaneQ
+ (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+}
+
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c2853da..9cf1a51 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5631,6 +5631,11 @@
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+ int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+ int_aarch64_neon_sqrdmulh_laneq>;
+
// Generated by MachineCombine
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 40efac2..5830837 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -230,6 +230,7 @@
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
+ case AArch64::FPR64_loRegClassID:
case AArch64::FPR128RegClassID:
case AArch64::FPR128_loRegClassID:
case AArch64::DDRegClassID:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index cdfbc0f..6183487 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -596,6 +596,7 @@
return 32;
case AArch64::FPR128_loRegClassID:
+ case AArch64::FPR64_loRegClassID:
return 16;
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index f52feab..4d89391 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -429,6 +429,10 @@
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
v1i64, v4f16],
64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+ [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+ 64, (trunc FPR64, 16)>;
+
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
@@ -503,6 +507,9 @@
let Name = "VectorRegLo";
let PredicateMethod = "isNeonVectorRegLo";
}
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+ let ParserMatchClass = VectorRegLoAsmOperand;
+}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 5c7697a..fb09cc2 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1033,8 +1033,10 @@
bool isNeonVectorRegLo() const {
return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
- AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
- Reg.RegNum);
+ (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+ Reg.RegNum) ||
+ AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+ Reg.RegNum));
}
template <unsigned Class> bool isSVEVectorReg() const {
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 2682678..eee0d77d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -9,20 +9,36 @@
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
@@ -1515,6 +1531,37 @@
ret <4 x i16> %vqdmulh2.i
}
+define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+ ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+ ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+ ret <4 x i16> %vqdmulh2.i
+}
+
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1527,6 +1574,37 @@
ret <8 x i16> %vqdmulh2.i
}
+define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+ ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+ ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+ ret <8 x i16> %vqdmulh2.i
+}
+
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1539,6 +1619,38 @@
ret <2 x i32> %vqdmulh2.i
}
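+; As above, for the 2s form of sqdmulh.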
+define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+ ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+ ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+ ret <2 x i32> %vqdmulh2.i
+}
+
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1551,6 +1663,38 @@
ret <4 x i32> %vqdmulh2.i
}
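+; As above, for the 4s (q) form of sqdmulh.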
+define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+ ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+ ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+ ret <4 x i32> %vqdmulh2.i
+}
+
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1563,6 +1707,38 @@
ret <4 x i16> %vqrdmulh2.i
}
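+; Equivalent coverage for the rounding variant, sqrdmulh (4h form).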
+define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1575,6 +1751,38 @@
ret <8 x i16> %vqrdmulh2.i
}
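+; As above, for the 8h (q) form of sqrdmulh.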
+define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1587,6 +1795,38 @@
ret <2 x i32> %vqrdmulh2.i
}
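+; As above, for the 2s form of sqrdmulh.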
+define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1599,6 +1839,38 @@
ret <4 x i32> %vqrdmulh2.i
}
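+; As above, for the 4s (q) form of sqrdmulh.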
+define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: // %bb.0: // %entry