Use the vAny type to get rid of the Neon intrinsics that differed only in
whether their overloaded vector types allowed floating-point or integer vector
elements. Most of these operations actually depend on the element type, so
bitcasting was not an option.
If you include the vpadd intrinsics that I updated earlier, this gets rid
of 20 intrinsics.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78646 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 1096e8e..a927da2 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1466,8 +1466,7 @@
switch (IntNo) {
default: break;
- case Intrinsic::arm_neon_vtrni:
- case Intrinsic::arm_neon_vtrnf:
+ case Intrinsic::arm_neon_vtrn:
switch (VT.getSimpleVT()) {
default: return NULL;
case EVT::v8i8: Opc = ARM::VTRNd8; break;
@@ -1482,8 +1481,7 @@
return CurDAG->getTargetNode(Opc, dl, VT, VT, N->getOperand(1),
N->getOperand(2));
- case Intrinsic::arm_neon_vuzpi:
- case Intrinsic::arm_neon_vuzpf:
+ case Intrinsic::arm_neon_vuzp:
switch (VT.getSimpleVT()) {
default: return NULL;
case EVT::v8i8: Opc = ARM::VUZPd8; break;
@@ -1498,8 +1496,7 @@
return CurDAG->getTargetNode(Opc, dl, VT, VT, N->getOperand(1),
N->getOperand(2));
- case Intrinsic::arm_neon_vzipi:
- case Intrinsic::arm_neon_vzipf:
+ case Intrinsic::arm_neon_vzip:
switch (VT.getSimpleVT()) {
default: return NULL;
case EVT::v8i8: Opc = ARM::VZIPd8; break;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 61722d4..1a662d9 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1360,23 +1360,17 @@
ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
- case Intrinsic::arm_neon_vld2i:
- case Intrinsic::arm_neon_vld2f:
+ case Intrinsic::arm_neon_vld2:
return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD2D);
- case Intrinsic::arm_neon_vld3i:
- case Intrinsic::arm_neon_vld3f:
+ case Intrinsic::arm_neon_vld3:
return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD3D);
- case Intrinsic::arm_neon_vld4i:
- case Intrinsic::arm_neon_vld4f:
+ case Intrinsic::arm_neon_vld4:
return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD4D);
- case Intrinsic::arm_neon_vst2i:
- case Intrinsic::arm_neon_vst2f:
+ case Intrinsic::arm_neon_vst2:
return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST2D, 2);
- case Intrinsic::arm_neon_vst3i:
- case Intrinsic::arm_neon_vst3f:
+ case Intrinsic::arm_neon_vst3:
return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST3D, 3);
- case Intrinsic::arm_neon_vst4i:
- case Intrinsic::arm_neon_vst4f:
+ case Intrinsic::arm_neon_vst4:
return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST4D, 4);
default: return SDValue(); // Don't custom lower most intrinsics.
}
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 1ed3a61..53283e8 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -173,17 +173,17 @@
!strconcat(OpcodeStr, "\t${dst:dregpair}, $addr"),
[(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
-def VLD1d8 : VLD1D<"vld1.8", v8i8, int_arm_neon_vld1i>;
-def VLD1d16 : VLD1D<"vld1.16", v4i16, int_arm_neon_vld1i>;
-def VLD1d32 : VLD1D<"vld1.32", v2i32, int_arm_neon_vld1i>;
-def VLD1df : VLD1D<"vld1.32", v2f32, int_arm_neon_vld1f>;
-def VLD1d64 : VLD1D<"vld1.64", v1i64, int_arm_neon_vld1i>;
+def VLD1d8 : VLD1D<"vld1.8", v8i8, int_arm_neon_vld1>;
+def VLD1d16 : VLD1D<"vld1.16", v4i16, int_arm_neon_vld1>;
+def VLD1d32 : VLD1D<"vld1.32", v2i32, int_arm_neon_vld1>;
+def VLD1df : VLD1D<"vld1.32", v2f32, int_arm_neon_vld1>;
+def VLD1d64 : VLD1D<"vld1.64", v1i64, int_arm_neon_vld1>;
-def VLD1q8 : VLD1Q<"vld1.8", v16i8, int_arm_neon_vld1i>;
-def VLD1q16 : VLD1Q<"vld1.16", v8i16, int_arm_neon_vld1i>;
-def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1i>;
-def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1f>;
-def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1i>;
+def VLD1q8 : VLD1Q<"vld1.8", v16i8, int_arm_neon_vld1>;
+def VLD1q16 : VLD1Q<"vld1.16", v8i16, int_arm_neon_vld1>;
+def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1>;
+def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1>;
+def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1>;
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2D<string OpcodeStr>
@@ -228,17 +228,17 @@
!strconcat(OpcodeStr, "\t${src:dregpair}, $addr"),
[(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
-def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>;
-def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>;
-def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>;
-def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>;
-def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>;
+def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1>;
+def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1>;
+def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1>;
+def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1>;
+def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1>;
-def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>;
-def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>;
-def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>;
-def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>;
-def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>;
+def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1>;
+def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1>;
+def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1>;
+def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1>;
+def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1>;
// VST2 : Vector Store (multiple 2-element structures)
class VST2D<string OpcodeStr>
@@ -1223,9 +1223,9 @@
defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
- int_arm_neon_vabdf, 0>;
+ int_arm_neon_vabds, 0>;
def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
- int_arm_neon_vabdf, 0>;
+ int_arm_neon_vabds, 0>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
@@ -1245,17 +1245,17 @@
defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
- int_arm_neon_vmaxf, 1>;
+ int_arm_neon_vmaxs, 1>;
def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
- int_arm_neon_vmaxf, 1>;
+ int_arm_neon_vmaxs, 1>;
// VMIN : Vector Minimum
defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
- int_arm_neon_vminf, 1>;
+ int_arm_neon_vmins, 1>;
def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
- int_arm_neon_vminf, 1>;
+ int_arm_neon_vmins, 1>;
// Vector Pairwise Operations.
@@ -1295,7 +1295,7 @@
def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
int_arm_neon_vpmaxu, 0>;
def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
- int_arm_neon_vpmaxf, 0>;
+ int_arm_neon_vpmaxs, 0>;
// VPMIN : Vector Pairwise Minimum
def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
@@ -1311,7 +1311,7 @@
def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
int_arm_neon_vpminu, 0>;
def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
- int_arm_neon_vpminf, 0>;
+ int_arm_neon_vpmins, 0>;
// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
@@ -1321,9 +1321,9 @@
def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
v4i32, v4i32, int_arm_neon_vrecpe>;
def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
- v2f32, v2f32, int_arm_neon_vrecpef>;
+ v2f32, v2f32, int_arm_neon_vrecpe>;
def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
- v4f32, v4f32, int_arm_neon_vrecpef>;
+ v4f32, v4f32, int_arm_neon_vrecpe>;
// VRECPS : Vector Reciprocal Step
def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
@@ -1337,9 +1337,9 @@
def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
v4i32, v4i32, int_arm_neon_vrsqrte>;
def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
- v2f32, v2f32, int_arm_neon_vrsqrtef>;
+ v2f32, v2f32, int_arm_neon_vrsqrte>;
def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
- v4f32, v4f32, int_arm_neon_vrsqrtef>;
+ v4f32, v4f32, int_arm_neon_vrsqrte>;
// VRSQRTS : Vector Reciprocal Square Root Step
def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
@@ -1480,9 +1480,9 @@
defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
int_arm_neon_vabs>;
def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v2f32, v2f32, int_arm_neon_vabsf>;
+ v2f32, v2f32, int_arm_neon_vabs>;
def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v4f32, v4f32, int_arm_neon_vabsf>;
+ v4f32, v4f32, int_arm_neon_vabs>;
// VQABS : Vector Saturating Absolute Value
defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
@@ -2017,7 +2017,7 @@
// Vector Absolute used for single-precision FP
let neverHasSideEffects = 1 in
def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v2f32, v2f32, int_arm_neon_vabsf>;
+ v2f32, v2f32, int_arm_neon_vabs>;
def : N2VDIntsPat<fabs, VABSfd_sfp>;
// Vector Negate used for single-precision FP