Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 0297de1..1c022aa 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -1,4 +1,4 @@
-//==-- AArch64.h - Top-level interface for AArch64 representation -*- C++ -*-=//
+//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -12,35 +12,38 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AARCH64_H
-#define LLVM_TARGET_AARCH64_H
+#ifndef TARGET_AArch64_H
+#define TARGET_AArch64_H
+#include "Utils/AArch64BaseInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/DataTypes.h"
namespace llvm {
-class AArch64AsmPrinter;
-class FunctionPass;
class AArch64TargetMachine;
-class MachineInstr;
-class MCInst;
+class FunctionPass;
+class MachineFunctionPass;
-FunctionPass *createAArch64ISelDAG(AArch64TargetMachine &TM,
- CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64ConditionalCompares();
+FunctionPass *createAArch64AdvSIMDScalar();
+FunctionPass *createAArch64BranchRelaxation();
+FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64StorePairSuppressPass();
+FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64LoadStoreOptimizationPass();
+ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64AddressTypePromotionPass();
+/// \brief Creates an ARM-specific Target Transformation Info pass.
+ImmutablePass *
+createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
-FunctionPass *createAArch64BranchFixupPass();
-
-/// \brief Creates an AArch64-specific Target Transformation Info pass.
-ImmutablePass *createAArch64TargetTransformInfoPass(
- const AArch64TargetMachine *TM);
-
-void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- AArch64AsmPrinter &AP);
-
-
-}
+FunctionPass *createAArch64CollectLOHPass();
+} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e49afd6..1ad5ac8 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -1,4 +1,4 @@
-//===- AArch64.td - Describe the AArch64 Target Machine -------*- tblgen -*-==//
+//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,11 @@
//
//===----------------------------------------------------------------------===//
//
-// This is the top level entry point for the AArch64 target.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Target-independent interfaces
+// Target-independent interfaces which we are implementing
//===----------------------------------------------------------------------===//
include "llvm/Target/Target.td"
@@ -22,7 +21,7 @@
//
def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
- "Enable ARMv8 FP">;
+ "Enable ARMv8 FP">;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
@@ -30,54 +29,106 @@
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable cryptographic instructions">;
-//===----------------------------------------------------------------------===//
-// AArch64 Processors
-//
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+ "Enable ARMv8 CRC-32 checksum instructions">;
-include "AArch64Schedule.td"
+/// Cyclone has register move instructions which are "free".
+def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
+ "Has zero-cycle register moves">;
-class ProcNoItin<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
-
-def : Processor<"generic", GenericItineraries, [FeatureFPARMv8, FeatureNEON]>;
-
-def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
- "Cortex-A53 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto]>;
-
-def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
- "Cortex-A57 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto]>;
-
-def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
-def : Processor<"cortex-a57", NoItineraries, [ProcA57]>;
+/// Cyclone has instructions which zero registers for "free".
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+ "Has zero-cycle zeroing instructions">;
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
include "AArch64RegisterInfo.td"
-
-include "AArch64CallingConv.td"
+include "AArch64CallingConvention.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
//===----------------------------------------------------------------------===//
+include "AArch64Schedule.td"
include "AArch64InstrInfo.td"
-def AArch64InstrInfo : InstrInfo {
- let noNamedPositionallyEncodedOperands = 1;
+def AArch64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// AArch64 Processors supported.
+//
+include "AArch64SchedA53.td"
+include "AArch64SchedCyclone.td"
+
+def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+ "Cortex-A53 ARM processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+ "Cortex-A57 ARM processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
+ "Cyclone",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureZCRegMove, FeatureZCZeroing]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCRC]>;
+
+def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
+def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>;
+def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def GenericAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "generic";
+}
+
+def AppleAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+ string Name = "apple-neon";
}
//===----------------------------------------------------------------------===//
-// Declare the target which we are implementing
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// AArch64 Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def GenericAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
+def AppleAsmWriter : AsmWriter {
+ let AsmWriterClassName = "AppleInstPrinter";
+ int Variant = 1;
+ int isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
//===----------------------------------------------------------------------===//
def AArch64 : Target {
let InstructionSet = AArch64InstrInfo;
+ let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
+ let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
}
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
new file mode 100644
index 0000000..04906f6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -0,0 +1,492 @@
+//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to promote the computations use to obtained a sign extended
+// value used into memory accesses.
+// E.g.
+// a = add nsw i32 b, 3
+// d = sext i32 a to i64
+// e = getelementptr ..., i64 d
+//
+// =>
+// f = sext i32 b to i64
+// a = add nsw i64 f, 3
+// e = getelementptr ..., i64 a
+//
+// This is legal to do so if the computations are markers with either nsw or nuw
+// markers.
+// Moreover, the current heuristic is simple: it does not create new sext
+// operations, i.e., it gives up when a sext would have forked (e.g., if
+// a = add i32 b, c, two sexts are required to promote the computation).
+//
+// FIXME: This pass may be useful for other targets too.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-type-promotion"
+
+static cl::opt<bool>
+EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden,
+ cl::desc("Enable the type promotion pass"),
+ cl::init(true));
+static cl::opt<bool>
+EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
+ cl::desc("Enable merging of redundant sexts when one is dominating"
+ " the other."),
+ cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// AArch64AddressTypePromotion
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+void initializeAArch64AddressTypePromotionPass(PassRegistry &);
+}
+
+namespace {
+class AArch64AddressTypePromotion : public FunctionPass {
+
+public:
+ static char ID;
+ AArch64AddressTypePromotion()
+ : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+ initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
+ }
+
+ const char *getPassName() const override {
+ return "AArch64 Address Type Promotion";
+ }
+
+ /// Iterate over the functions and promote the computation of interesting
+ // sext instructions.
+ bool runOnFunction(Function &F) override;
+
+private:
+ /// The current function.
+ Function *Func;
+ /// Filter out all sexts that does not have this type.
+ /// Currently initialized with Int64Ty.
+ Type *ConsideredSExtType;
+
+ // This transformation requires dominator info.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
+ typedef SmallVector<Instruction *, 16> Instructions;
+ typedef DenseMap<Value *, Instructions> ValueToInsts;
+
+ /// Check if it is profitable to move a sext through this instruction.
+ /// Currently, we consider it is profitable if:
+ /// - Inst is used only once (no need to insert truncate).
+ /// - Inst has only one operand that will require a sext operation (we do
+ /// do not create new sext operation).
+ bool shouldGetThrough(const Instruction *Inst);
+
+ /// Check if it is possible and legal to move a sext through this
+ /// instruction.
+ /// Current heuristic considers that we can get through:
+ /// - Arithmetic operation marked with the nsw or nuw flag.
+ /// - Other sext operation.
+ /// - Truncate operation if it was just dropping sign extended bits.
+ bool canGetThrough(const Instruction *Inst);
+
+ /// Move sext operations through safe to sext instructions.
+ bool propagateSignExtension(Instructions &SExtInsts);
+
+ /// Is this sext should be considered for code motion.
+ /// We look for sext with ConsideredSExtType and uses in at least one
+ // GetElementPtrInst.
+ bool shouldConsiderSExt(const Instruction *SExt) const;
+
+ /// Collect all interesting sext operations, i.e., the ones with the right
+ /// type and used in memory accesses.
+ /// More precisely, a sext instruction is considered as interesting if it
+ /// is used in a "complex" getelementptr or it exits at least another
+ /// sext instruction that sign extended the same initial value.
+ /// A getelementptr is considered as "complex" if it has more than 2
+ // operands.
+ void analyzeSExtension(Instructions &SExtInsts);
+
+ /// Merge redundant sign extension operations in common dominator.
+ void mergeSExts(ValueToInsts &ValToSExtendedUses,
+ SetOfInstructions &ToRemove);
+};
+} // end anonymous namespace.
+
+char AArch64AddressTypePromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
+ "AArch64 Type Promotion Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
+ "AArch64 Type Promotion Pass", false, false)
+
+FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
+ return new AArch64AddressTypePromotion();
+}
+
+bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
+ if (isa<SExtInst>(Inst))
+ return true;
+
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+ if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+ (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
+ return true;
+
+ // sext(trunc(sext)) --> sext
+ if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
+ const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
+ // Check that the truncate just drop sign extended bits.
+ if (Inst->getType()->getIntegerBitWidth() >=
+ Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
+ Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
+ ConsideredSExtType->getIntegerBitWidth())
+ return true;
+ }
+
+ return false;
+}
+
+bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
+ // If the type of the sext is the same as the considered one, this sext
+ // will become useless.
+ // Otherwise, we will have to do something to preserve the original value,
+ // unless it is used once.
+ if (isa<SExtInst>(Inst) &&
+ (Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
+ return true;
+
+ // If the Inst is used more that once, we may need to insert truncate
+ // operations and we don't do that at the moment.
+ if (!Inst->hasOneUse())
+ return false;
+
+ // This truncate is used only once, thus if we can get thourgh, it will become
+ // useless.
+ if (isa<TruncInst>(Inst))
+ return true;
+
+ // If both operands are not constant, a new sext will be created here.
+ // Current heuristic is: each step should be profitable.
+ // Therefore we don't allow to increase the number of sext even if it may
+ // be profitable later on.
+ if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
+ return true;
+
+ return false;
+}
+
+static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
+ if (isa<SelectInst>(Inst) && OpIdx == 0)
+ return false;
+ return true;
+}
+
+bool
+AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
+ if (SExt->getType() != ConsideredSExtType)
+ return false;
+
+ for (const Use &U : SExt->uses()) {
+ if (isa<GetElementPtrInst>(*U))
+ return true;
+ }
+
+ return false;
+}
+
+// Input:
+// - SExtInsts contains all the sext instructions that are use direclty in
+// GetElementPtrInst, i.e., access to memory.
+// Algorithm:
+// - For each sext operation in SExtInsts:
+// Let var be the operand of sext.
+// while it is profitable (see shouldGetThrough), legal, and safe
+// (see canGetThrough) to move sext through var's definition:
+// * promote the type of var's definition.
+// * fold var into sext uses.
+// * move sext above var's definition.
+// * update sext operand to use the operand of var that should be sign
+// extended (by construction there is only one).
+//
+// E.g.,
+// a = ... i32 c, 3
+// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
+// ...
+// = b
+// => Yes, update the code
+// b = sext i32 c to i64
+// a = ... i64 b, 3
+// ...
+// = a
+// Iterate on 'c'.
+bool
+AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
+ DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
+
+ bool LocalChange = false;
+ SetOfInstructions ToRemove;
+ ValueToInsts ValToSExtendedUses;
+ while (!SExtInsts.empty()) {
+ // Get through simple chain.
+ Instruction *SExt = SExtInsts.pop_back_val();
+
+ DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
+
+ // If this SExt has already been merged continue.
+ if (SExt->use_empty() && ToRemove.count(SExt)) {
+ DEBUG(dbgs() << "No uses => marked as delete\n");
+ continue;
+ }
+
+ // Now try to get through the chain of definitions.
+ while (isa<Instruction>(SExt->getOperand(0))) {
+ Instruction *Inst = dyn_cast<Instruction>(SExt->getOperand(0));
+ DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
+ if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
+ // We cannot get through something that is not an Instruction
+ // or not safe to SExt.
+ DEBUG(dbgs() << "Cannot get through\n");
+ break;
+ }
+
+ LocalChange = true;
+ // If this is a sign extend, it becomes useless.
+ if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
+ DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
+ // We cannot use replaceAllUsesWith here because we may trigger some
+ // assertion on the type as all involved sext operation may have not
+ // been moved yet.
+ while (!Inst->use_empty()) {
+ Value::use_iterator UseIt = Inst->use_begin();
+ Instruction *UseInst = dyn_cast<Instruction>(*UseIt);
+ assert(UseInst && "Use of sext is not an Instruction!");
+ UseInst->setOperand(UseIt->getOperandNo(), SExt);
+ }
+ ToRemove.insert(Inst);
+ SExt->setOperand(0, Inst->getOperand(0));
+ SExt->moveBefore(Inst);
+ continue;
+ }
+
+ // Get through the Instruction:
+ // 1. Update its type.
+ // 2. Replace the uses of SExt by Inst.
+ // 3. Sign extend each operand that needs to be sign extended.
+
+ // Step #1.
+ Inst->mutateType(SExt->getType());
+ // Step #2.
+ SExt->replaceAllUsesWith(Inst);
+ // Step #3.
+ Instruction *SExtForOpnd = SExt;
+
+ DEBUG(dbgs() << "Propagate SExt to operands\n");
+ for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
+ if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
+ !shouldSExtOperand(Inst, OpIdx)) {
+ DEBUG(dbgs() << "No need to propagate\n");
+ continue;
+ }
+ // Check if we can statically sign extend the operand.
+ Value *Opnd = Inst->getOperand(OpIdx);
+ if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Statically sign extend\n");
+ Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
+ Cst->getSExtValue()));
+ continue;
+ }
+ // UndefValue are typed, so we have to statically sign extend them.
+ if (isa<UndefValue>(Opnd)) {
+ DEBUG(dbgs() << "Statically sign extend\n");
+ Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
+ continue;
+ }
+
+ // Otherwise we have to explicity sign extend it.
+ assert(SExtForOpnd &&
+ "Only one operand should have been sign extended");
+
+ SExtForOpnd->setOperand(0, Opnd);
+
+ DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
+ // Move the sign extension before the insertion point.
+ SExtForOpnd->moveBefore(Inst);
+ Inst->setOperand(OpIdx, SExtForOpnd);
+ // If more sext are required, new instructions will have to be created.
+ SExtForOpnd = nullptr;
+ }
+ if (SExtForOpnd == SExt) {
+ DEBUG(dbgs() << "Sign extension is useless now\n");
+ ToRemove.insert(SExt);
+ break;
+ }
+ }
+
+ // If the use is already of the right type, connect its uses to its argument
+ // and delete it.
+ // This can happen for an Instruction which all uses are sign extended.
+ if (!ToRemove.count(SExt) &&
+ SExt->getType() == SExt->getOperand(0)->getType()) {
+ DEBUG(dbgs() << "Sign extension is useless, attach its use to "
+ "its argument\n");
+ SExt->replaceAllUsesWith(SExt->getOperand(0));
+ ToRemove.insert(SExt);
+ } else
+ ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
+ }
+
+ if (EnableMerge)
+ mergeSExts(ValToSExtendedUses, ToRemove);
+
+ // Remove all instructions marked as ToRemove.
+ for (Instruction *I: ToRemove)
+ I->eraseFromParent();
+ return LocalChange;
+}
+
+void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+ SetOfInstructions &ToRemove) {
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ for (auto &Entry : ValToSExtendedUses) {
+ Instructions &Insts = Entry.second;
+ Instructions CurPts;
+ for (Instruction *Inst : Insts) {
+ if (ToRemove.count(Inst))
+ continue;
+ bool inserted = false;
+ for (auto Pt : CurPts) {
+ if (DT.dominates(Inst, Pt)) {
+ DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
+ << *Inst << '\n');
+ (Pt)->replaceAllUsesWith(Inst);
+ ToRemove.insert(Pt);
+ Pt = Inst;
+ inserted = true;
+ break;
+ }
+ if (!DT.dominates(Pt, Inst))
+ // Give up if we need to merge in a common dominator as the
+ // expermients show it is not profitable.
+ continue;
+
+ DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
+ << *Pt << '\n');
+ Inst->replaceAllUsesWith(Pt);
+ ToRemove.insert(Inst);
+ inserted = true;
+ break;
+ }
+ if (!inserted)
+ CurPts.push_back(Inst);
+ }
+ }
+}
+
+void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+ DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
+
+ DenseMap<Value *, Instruction *> SeenChains;
+
+ for (auto &BB : *Func) {
+ for (auto &II : BB) {
+ Instruction *SExt = &II;
+
+ // Collect all sext operation per type.
+ if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
+ continue;
+
+ DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
+
+ // Cases where we actually perform the optimization:
+ // 1. SExt is used in a getelementptr with more than 2 operand =>
+ // likely we can merge some computation if they are done on 64 bits.
+ // 2. The beginning of the SExt chain is SExt several time. =>
+ // code sharing is possible.
+
+ bool insert = false;
+ // #1.
+ for (const Use &U : SExt->uses()) {
+ const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
+ if (Inst && Inst->getNumOperands() > 2) {
+ DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
+ << '\n');
+ insert = true;
+ break;
+ }
+ }
+
+ // #2.
+ // Check the head of the chain.
+ Instruction *Inst = SExt;
+ Value *Last;
+ do {
+ int OpdIdx = 0;
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+ if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
+ OpdIdx = 1;
+ Last = Inst->getOperand(OpdIdx);
+ Inst = dyn_cast<Instruction>(Last);
+ } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
+
+ DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
+ DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+ SeenChains.find(Last);
+ if (insert || AlreadySeen != SeenChains.end()) {
+ DEBUG(dbgs() << "Insert\n");
+ SExtInsts.push_back(SExt);
+ if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
+ DEBUG(dbgs() << "Insert chain member\n");
+ SExtInsts.push_back(AlreadySeen->second);
+ SeenChains[Last] = nullptr;
+ }
+ } else {
+ DEBUG(dbgs() << "Record its chain membership\n");
+ SeenChains[Last] = SExt;
+ }
+ }
+ }
+}
+
+bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+ if (!EnableAddressTypePromotion || F.isDeclaration())
+ return false;
+ Func = &F;
+ ConsideredSExtType = Type::getInt64Ty(Func->getContext());
+
+ DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
+
+ Instructions SExtInsts;
+ analyzeSExtension(SExtInsts);
+ return propagateSignExtension(SExtInsts);
+}
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
new file mode 100644
index 0000000..734fb21
--- /dev/null
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -0,0 +1,387 @@
+//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When profitable, replace GPR targeting i64 instructions with their
+// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
+// as minimizing the number of cross-class register copies.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Graph based predicate heuristics.
+// Walking the instruction list linearly will get many, perhaps most, of
+// the cases, but to do a truly thorough job of this, we need a more
+// wholistic approach.
+//
+// This optimization is very similar in spirit to the register allocator's
+// spill placement, only here we're determining where to place cross-class
+// register copies rather than spills. As such, a similar approach is
+// called for.
+//
+// We want to build up a set of graphs of all instructions which are candidates
+// for transformation along with instructions which generate their inputs and
+// consume their outputs. For each edge in the graph, we assign a weight
+// based on whether there is a copy required there (weight zero if not) and
+// the block frequency of the block containing the defining or using
+// instruction, whichever is less. Our optimization is then a graph problem
+// to minimize the total weight of all the graphs, then transform instructions
+// and add or remove copy instructions as called for to implement the
+// solution.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-simd-scalar"
+
+// Allow forcing all i64 operations with equivalent SIMD instructions to use
+// them. For stress-testing the transformation function.
+static cl::opt<bool>
+TransformAll("aarch64-simd-scalar-force-all",
+ cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
+STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
+STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+
+namespace {
+class AArch64AdvSIMDScalar : public MachineFunctionPass {
+ MachineRegisterInfo *MRI;
+ const AArch64InstrInfo *TII;
+
+private:
+ // isProfitableToTransform - Predicate function to determine whether an
+ // instruction should be transformed to its equivalent AdvSIMD scalar
+ // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+ bool isProfitableToTransform(const MachineInstr *MI) const;
+
+ // transformInstruction - Perform the transformation of an instruction
+ // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+ // to be the correct register class, minimizing cross-class copies.
+ void transformInstruction(MachineInstr *MI);
+
+ // processMachineBasicBlock - Main optimzation loop.
+ bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ const char *getPassName() const override {
+ return "AdvSIMD Scalar Operation Optimization";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char AArch64AdvSIMDScalar::ID = 0;
+} // end anonymous namespace
+
+static bool isGPR64(unsigned Reg, unsigned SubReg,
+ const MachineRegisterInfo *MRI) {
+ if (SubReg)
+ return false;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
+ return AArch64::GPR64RegClass.contains(Reg);
+}
+
+static bool isFPR64(unsigned Reg, unsigned SubReg,
+ const MachineRegisterInfo *MRI) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
+ SubReg == 0) ||
+ (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
+ SubReg == AArch64::dsub);
+ // Physical register references just check the register class directly.
+ return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
+ (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
+}
+
+// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
+// copy instruction. Return zero_reg if the instruction is not a copy.
+static unsigned getSrcFromCopy(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &SubReg) {
+ SubReg = 0;
+ // The "FMOV Xd, Dn" instruction is the typical form.
+ if (MI->getOpcode() == AArch64::FMOVDXr ||
+ MI->getOpcode() == AArch64::FMOVXDr)
+ return MI->getOperand(1).getReg();
+ // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
+ // these at this stage, but it's easy to check for.
+ if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
+ SubReg = AArch64::dsub;
+ return MI->getOperand(1).getReg();
+ }
+ // Or just a plain COPY instruction. This can be directly to/from FPR64,
+ // or it can be a dsub subreg reference to an FPR128.
+ if (MI->getOpcode() == AArch64::COPY) {
+ if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+ MRI) &&
+ isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
+ return MI->getOperand(1).getReg();
+ if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+ MRI) &&
+ isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
+ MRI)) {
+ SubReg = MI->getOperand(1).getSubReg();
+ return MI->getOperand(1).getReg();
+ }
+ }
+
+ // Otherwise, this is some other kind of instruction.
+ return 0;
+}
+
+// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
+// that we're considering transforming to, return that AdvSIMD opcode. For all
+// others, return the original opcode.
+static int getTransformOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ break;
+ // FIXME: Lots more possibilities.
+ case AArch64::ADDXrr:
+ return AArch64::ADDv1i64;
+ case AArch64::SUBXrr:
+ return AArch64::SUBv1i64;
+ }
+ // No AdvSIMD equivalent, so just return the original opcode.
+ return Opc;
+}
+
+static bool isTransformable(const MachineInstr *MI) {
+ int Opc = MI->getOpcode();
+ return Opc != getTransformOpcode(Opc);
+}
+
+// isProfitableToTransform - Predicate function to determine whether an
+// instruction should be transformed to its equivalent AdvSIMD scalar
+// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+bool
+AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+ // If this instruction isn't eligible to be transformed (no SIMD equivalent),
+ // early exit since that's the common case.
+ if (!isTransformable(MI))
+ return false;
+
+ // Count the number of copies we'll need to add and approximate the number
+ // of copies that a transform will enable us to remove.
+ unsigned NumNewCopies = 3;
+ unsigned NumRemovableCopies = 0;
+
+ unsigned OrigSrc0 = MI->getOperand(1).getReg();
+ unsigned OrigSrc1 = MI->getOperand(2).getReg();
+ unsigned Src0 = 0, SubReg0;
+ unsigned Src1 = 0, SubReg1;
+ if (!MRI->def_empty(OrigSrc0)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc0);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ // If the source was from a copy, we don't need to insert a new copy.
+ if (Src0)
+ --NumNewCopies;
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
+ ++NumRemovableCopies;
+ }
+ if (!MRI->def_empty(OrigSrc1)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc1);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ if (Src1)
+ --NumNewCopies;
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
+ ++NumRemovableCopies;
+ }
+
+ // If any of the uses of the original instructions is a cross class copy,
+ // that's a copy that will be removable if we transform. Likewise, if
+ // any of the uses is a transformable instruction, it's likely the tranforms
+ // will chain, enabling us to save a copy there, too. This is an aggressive
+ // heuristic that approximates the graph based cost analysis described above.
+ unsigned Dst = MI->getOperand(0).getReg();
+ bool AllUsesAreCopies = true;
+ for (MachineRegisterInfo::use_instr_nodbg_iterator
+ Use = MRI->use_instr_nodbg_begin(Dst),
+ E = MRI->use_instr_nodbg_end();
+ Use != E; ++Use) {
+ unsigned SubReg;
+ if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
+ ++NumRemovableCopies;
+ // If the use is an INSERT_SUBREG, that's still something that can
+ // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
+ // preferable to have it use the FPR64 in most cases, as if the source
+ // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
+ // Ditto for a lane insert.
+ else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
+ Use->getOpcode() == AArch64::INSvi64gpr)
+ ;
+ else
+ AllUsesAreCopies = false;
+ }
+ // If all of the uses of the original destination register are copies to
+ // FPR64, then we won't end up having a new copy back to GPR64 either.
+ if (AllUsesAreCopies)
+ --NumNewCopies;
+
+ // If a transform will not increase the number of cross-class copies required,
+ // return true.
+ if (NumNewCopies <= NumRemovableCopies)
+ return true;
+
+ // Finally, even if we otherwise wouldn't transform, check if we're forcing
+ // transformation of everything.
+ return TransformAll;
+}
+
+static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI,
+ unsigned Dst, unsigned Src, bool IsKill) {
+ MachineInstrBuilder MIB =
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
+ Dst)
+ .addReg(Src, getKillRegState(IsKill));
+ DEBUG(dbgs() << " adding copy: " << *MIB);
+ ++NumCopiesInserted;
+ return MIB;
+}
+
+// transformInstruction - Perform the transformation of an instruction
+// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+// to be the correct register class, minimizing cross-class copies.
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
+ DEBUG(dbgs() << "Scalar transform: " << *MI);
+
+ MachineBasicBlock *MBB = MI->getParent();
+ int OldOpc = MI->getOpcode();
+ int NewOpc = getTransformOpcode(OldOpc);
+ assert(OldOpc != NewOpc && "transform an instruction to itself?!");
+
+ // Check if we need a copy for the source registers.
+ unsigned OrigSrc0 = MI->getOperand(1).getReg();
+ unsigned OrigSrc1 = MI->getOperand(2).getReg();
+ unsigned Src0 = 0, SubReg0;
+ unsigned Src1 = 0, SubReg1;
+ if (!MRI->def_empty(OrigSrc0)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc0);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
+ assert(Src0 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
+ }
+ if (!MRI->def_empty(OrigSrc1)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc1);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
+ assert(Src1 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
+ }
+ // If we weren't able to reference the original source directly, create a
+ // copy.
+ if (!Src0) {
+ SubReg0 = 0;
+ Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+ insertCopy(TII, MI, Src0, OrigSrc0, true);
+ }
+ if (!Src1) {
+ SubReg1 = 0;
+ Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+ insertCopy(TII, MI, Src1, OrigSrc1, true);
+ }
+
+ // Create a vreg for the destination.
+ // FIXME: No need to do this if the ultimate user expects an FPR64.
+ // Check for that and avoid the copy if possible.
+ unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+
+ // For now, all of the new instructions have the same simple three-register
+ // form, so no need to special case based on what instruction we're
+ // building.
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
+ .addReg(Src0, getKillRegState(true), SubReg0)
+ .addReg(Src1, getKillRegState(true), SubReg1);
+
+ // Now copy the result back out to a GPR.
+ // FIXME: Try to avoid this if all uses could actually just use the FPR64
+ // directly.
+ insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
+
+ // Erase the old instruction.
+ MI->eraseFromParent();
+
+ ++NumScalarInsnsUsed;
+}
+
+// processMachineBasicBlock - Main optimzation loop.
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr *MI = I;
+ ++I;
+ if (isProfitableToTransform(MI)) {
+ transformInstruction(MI);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+// runOnMachineFunction - Pass entry point from PassManager.
+bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
+ bool Changed = false;
+ DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+
+ const TargetMachine &TM = mf.getTarget();
+ MRI = &mf.getRegInfo();
+ TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+
+ // Just check things on a one-block-at-a-time basis.
+ for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
+ if (processMachineBasicBlock(I))
+ Changed = true;
+ return Changed;
+}
+
+// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
+// to add the pass to the PassManager.
+FunctionPass *llvm::createAArch64AdvSIMDScalar() {
+ return new AArch64AdvSIMDScalar();
+}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index f0b52d3..c3ee9bb 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64AsmPrinter.cpp - Print machine code to an AArch64 .s file --===//
+//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,266 +8,133 @@
//===----------------------------------------------------------------------===//
//
// This file contains a printer that converts from our internal representation
-// of machine-dependent LLVM code to GAS-format AArch64 assembly language.
+// of machine-dependent LLVM code to the AArch64 assembly language.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "asm-printer"
-#include "AArch64AsmPrinter.h"
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64MCInstLower.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
#include "InstPrinter/AArch64InstPrinter.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
-
using namespace llvm;
-/// Try to print a floating-point register as if it belonged to a specified
-/// register-class. For example the inline asm operand modifier "b" requires its
-/// argument to be printed as "bN".
-static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
- const TargetRegisterInfo *TRI,
- char RegType, raw_ostream &O) {
- if (!MO.isReg())
- return true;
+#define DEBUG_TYPE "asm-printer"
- for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
- if (AArch64::FPR8RegClass.contains(*AR)) {
- O << RegType << TRI->getEncodingValue(MO.getReg());
- return false;
- }
+namespace {
+
+class AArch64AsmPrinter : public AsmPrinter {
+ /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+ /// make the right decision when printing asm code for different targets.
+ const AArch64Subtarget *Subtarget;
+
+ AArch64MCInstLower MCInstLowering;
+ StackMaps SM;
+
+public:
+ AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer),
+ Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
+ MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+ LOHLabelCounter(0) {}
+
+ const char *getPassName() const override {
+ return "AArch64 Assembly Printer";
}
- // The register doesn't correspond to anything floating-point like.
- return true;
-}
-
-/// Implements the 'w' and 'x' inline asm operand modifiers, which print a GPR
-/// with the obvious type and an immediate 0 as either wzr or xzr.
-static bool printModifiedGPRAsmOperand(const MachineOperand &MO,
- const TargetRegisterInfo *TRI,
- const TargetRegisterClass &RegClass,
- raw_ostream &O) {
- char Prefix = &RegClass == &AArch64::GPR32RegClass ? 'w' : 'x';
-
- if (MO.isImm() && MO.getImm() == 0) {
- O << Prefix << "zr";
- return false;
- } else if (MO.isReg()) {
- if (MO.getReg() == AArch64::XSP || MO.getReg() == AArch64::WSP) {
- O << (Prefix == 'x' ? "sp" : "wsp");
- return false;
- }
-
- for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
- if (RegClass.contains(*AR)) {
- O << AArch64InstPrinter::getRegisterName(*AR);
- return false;
- }
- }
+ /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+ /// tblgen'erated pseudo lowering.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+ return MCInstLowering.lowerOperand(MO, MCOp);
}
- return true;
-}
+ void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI);
+ void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI);
+ /// \brief tblgen'erated driver function for lowering simple MI->MC
+ /// pseudo instructions.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
-bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
- bool PrintImmediatePrefix,
- StringRef Suffix, raw_ostream &O) {
- StringRef Name;
- StringRef Modifier;
- switch (MO.getType()) {
- default:
- return true;
- case MachineOperand::MO_GlobalAddress:
- Name = getSymbol(MO.getGlobal())->getName();
+ void EmitInstruction(const MachineInstr *MI) override;
- // Global variables may be accessed either via a GOT or in various fun and
- // interesting TLS-model specific ways. Set the prefix modifier as
- // appropriate here.
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal())) {
- Reloc::Model RelocM = TM.getRelocationModel();
- if (GV->isThreadLocal()) {
- switch (TM.getTLSModel(GV)) {
- case TLSModel::GeneralDynamic:
- Modifier = "tlsdesc";
- break;
- case TLSModel::LocalDynamic:
- Modifier = "dtprel";
- break;
- case TLSModel::InitialExec:
- Modifier = "gottprel";
- break;
- case TLSModel::LocalExec:
- Modifier = "tprel";
- break;
- }
- } else if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
- Modifier = "got";
- }
- }
- break;
- case MachineOperand::MO_BlockAddress:
- Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName();
- break;
- case MachineOperand::MO_ConstantPoolIndex:
- Name = GetCPISymbol(MO.getIndex())->getName();
- break;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
}
- // Some instructions (notably ADRP) don't take the # prefix for
- // immediates. Only print it if asked to.
- if (PrintImmediatePrefix)
- O << '#';
-
- // Only need the joining "_" if both the prefix and the suffix are
- // non-null. This little block simply takes care of the four possibly
- // combinations involved there.
- if (Modifier == "" && Suffix == "")
- O << Name;
- else if (Modifier == "" && Suffix != "")
- O << ":" << Suffix << ':' << Name;
- else if (Modifier != "" && Suffix == "")
- O << ":" << Modifier << ':' << Name;
- else
- O << ":" << Modifier << '_' << Suffix << ':' << Name;
-
- return false;
-}
-
-bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant,
- const char *ExtraCode, raw_ostream &O) {
- const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
-
- if (!ExtraCode)
- ExtraCode = "";
-
- switch(ExtraCode[0]) {
- default:
- if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
- return false;
- break;
- case 'w':
- // Output 32-bit general register operand, constant zero as wzr, or stack
- // pointer as wsp. Ignored when used with other operand types.
- if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::GPR32RegClass, O))
- return false;
- break;
- case 'x':
- // Output 64-bit general register operand, constant zero as xzr, or stack
- // pointer as sp. Ignored when used with other operand types.
- if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::GPR64RegClass, O))
- return false;
- break;
- case 'H':
- // Output higher numbered of a 64-bit general register pair
- case 'Q':
- // Output least significant register of a 64-bit general register pair
- case 'R':
- // Output most significant register of a 64-bit general register pair
-
- // FIXME note: these three operand modifiers will require, to some extent,
- // adding a paired GPR64 register class. Initial investigation suggests that
- // assertions are hit unless it has a type and is made legal for that type
- // in ISelLowering. After that step is made, the number of modifications
- // needed explodes (operation legality, calling conventions, stores, reg
- // copies ...).
- llvm_unreachable("FIXME: Unimplemented register pairs");
- case 'b':
- case 'h':
- case 's':
- case 'd':
- case 'q':
- if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
- ExtraCode[0], O))
- return false;
- break;
- case 'A':
- // Output symbolic address with appropriate relocation modifier (also
- // suitable for ADRP).
- if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O))
- return false;
- break;
- case 'L':
- // Output bits 11:0 of symbolic address with appropriate :lo12: relocation
- // modifier.
- if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O))
- return false;
- break;
- case 'G':
- // Output bits 23:12 of symbolic address with appropriate :hi12: relocation
- // modifier (currently only for TLS local exec).
- if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O))
- return false;
- break;
- case 'a':
- return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+ bool runOnMachineFunction(MachineFunction &F) override {
+ AArch64FI = F.getInfo<AArch64FunctionInfo>();
+ return AsmPrinter::runOnMachineFunction(F);
}
- // There's actually no operand modifier, which leads to a slightly eclectic
- // set of behaviour which we have to handle here.
- const MachineOperand &MO = MI->getOperand(OpNum);
- switch (MO.getType()) {
- default:
- llvm_unreachable("Unexpected operand for inline assembly");
- case MachineOperand::MO_Register:
- // GCC prints the unmodified operand of a 'w' constraint as the vector
- // register. Technically, we could allocate the argument as a VPR128, but
- // that leads to extremely dodgy copies being generated to get the data
- // there.
- if (printModifiedFPRAsmOperand(MO, TRI, 'v', O))
- O << AArch64InstPrinter::getRegisterName(MO.getReg());
- break;
- case MachineOperand::MO_Immediate:
- O << '#' << MO.getImm();
- break;
- case MachineOperand::MO_FPImmediate:
- assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
- O << "#0.0";
- break;
- case MachineOperand::MO_BlockAddress:
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_GlobalAddress:
- return printSymbolicAddress(MO, false, "", O);
- }
+private:
+ MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+ void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+ bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+ bool printAsmRegInClass(const MachineOperand &MO,
+ const TargetRegisterClass *RC, bool isVector,
+ raw_ostream &O);
- return false;
-}
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
-bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNum,
- unsigned AsmVariant,
- const char *ExtraCode,
- raw_ostream &O) {
- // Currently both the memory constraints (m and Q) behave the same and amount
- // to the address as a single register. In future, we may allow "m" to provide
- // both a base and an offset.
- const MachineOperand &MO = MI->getOperand(OpNum);
- assert(MO.isReg() && "unexpected inline assembly memory operand");
- O << '[' << AArch64InstPrinter::getRegisterName(MO.getReg()) << ']';
- return false;
-}
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-#include "AArch64GenMCPseudoLowering.inc"
+ void EmitFunctionBodyEnd() override;
-void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- // Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(OutStreamer, MI))
- return;
+ MCSymbol *GetCPISymbol(unsigned CPID) const override;
+ void EmitEndOfAsmFile(Module &M) override;
+ AArch64FunctionInfo *AArch64FI;
- MCInst TmpInst;
- LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this);
- EmitToStreamer(OutStreamer, TmpInst);
-}
+ /// \brief Emit the LOHs contained in AArch64FI.
+ void EmitLOHs();
+
+ typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+ MInstToMCSymbol LOHInstToLabel;
+ unsigned LOHLabelCounter;
+};
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (Subtarget->isTargetMachO()) {
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ SM.serializeToStackMapSection();
+ }
+
+ // Emit a .data.rel section containing any stubs that were created.
if (Subtarget->isTargetELF()) {
const TargetLoweringObjectFileELF &TLOFELF =
static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
@@ -288,15 +155,370 @@
Stubs.clear();
}
}
+
}
-bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
- return AsmPrinter::runOnMachineFunction(MF);
+MachineLocation
+AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+ MachineLocation Location;
+ assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+ // Frame address. Currently handles register +- offset only.
+ if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+ Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+ else {
+ DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+ }
+ return Location;
+}
+
+void AArch64AsmPrinter::EmitLOHs() {
+ SmallVector<MCSymbol *, 3> MCArgs;
+
+ for (const auto &D : AArch64FI->getLOHContainer()) {
+ for (const MachineInstr *MI : D.getArgs()) {
+ MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
+ assert(LabelIt != LOHInstToLabel.end() &&
+ "Label hasn't been inserted for LOH related instruction");
+ MCArgs.push_back(LabelIt->second);
+ }
+ OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
+ MCArgs.clear();
+ }
+}
+
+void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+ if (!AArch64FI->getLOHRelated().empty())
+ EmitLOHs();
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+ // Darwin uses a linker-private symbol name for constant-pools (to
+ // avoid addends on the relocation?), ELF has no such concept and
+ // uses a normal private symbol.
+ if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
+ return OutContext.GetOrCreateSymbol(
+ Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
+ Twine(getFunctionNumber()) + "_" + Twine(CPID));
+
+ return OutContext.GetOrCreateSymbol(
+ Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
+ Twine(getFunctionNumber()) + "_" + Twine(CPID));
+}
+
+void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ default:
+ assert(0 && "<unknown operand type>");
+ case MachineOperand::MO_Register: {
+ unsigned Reg = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(!MO.getSubReg() && "Subregs should be eliminated!");
+ O << AArch64InstPrinter::getRegisterName(Reg);
+ break;
+ }
+ case MachineOperand::MO_Immediate: {
+ int64_t Imm = MO.getImm();
+ O << '#' << Imm;
+ break;
+ }
+ }
+}
+
+bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default:
+ return true; // Unknown mode.
+ case 'w':
+ Reg = getWRegFromXReg(Reg);
+ break;
+ case 'x':
+ Reg = getXRegFromWReg(Reg);
+ break;
+ }
+
+ O << AArch64InstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+// Prints the register in MO using class RC using the offset in the
+// new register class. This should not be used for cross class
+// printing.
+bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+ const TargetRegisterClass *RC,
+ bool isVector, raw_ostream &O) {
+ assert(MO.isReg() && "Should only get here with a register!");
+ const AArch64RegisterInfo *RI =
+ static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+ unsigned Reg = MO.getReg();
+ unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
+ assert(RI->regsOverlap(RegToPrint, Reg));
+ O << AArch64InstPrinter::getRegisterName(
+ RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+ return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+
+ // First try the generic code, which knows about modifiers like 'c' and 'n'.
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+ return false;
+
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ return true; // Unknown modifier.
+ case 'w': // Print W register
+ case 'x': // Print X register
+ if (MO.isReg())
+ return printAsmMRegister(MO, ExtraCode[0], O);
+ if (MO.isImm() && MO.getImm() == 0) {
+ unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR;
+ O << AArch64InstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ printOperand(MI, OpNum, O);
+ return false;
+ case 'b': // Print B register.
+ case 'h': // Print H register.
+ case 's': // Print S register.
+ case 'd': // Print D register.
+ case 'q': // Print Q register.
+ if (MO.isReg()) {
+ const TargetRegisterClass *RC;
+ switch (ExtraCode[0]) {
+ case 'b':
+ RC = &AArch64::FPR8RegClass;
+ break;
+ case 'h':
+ RC = &AArch64::FPR16RegClass;
+ break;
+ case 's':
+ RC = &AArch64::FPR32RegClass;
+ break;
+ case 'd':
+ RC = &AArch64::FPR64RegClass;
+ break;
+ case 'q':
+ RC = &AArch64::FPR128RegClass;
+ break;
+ default:
+ return true;
+ }
+ return printAsmRegInClass(MO, RC, false /* vector */, O);
+ }
+ printOperand(MI, OpNum, O);
+ return false;
+ }
+ }
+
+ // According to ARM, we should emit x and v registers unless we have a
+ // modifier.
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+
+ // If this is a w or x register, print an x register.
+ if (AArch64::GPR32allRegClass.contains(Reg) ||
+ AArch64::GPR64allRegClass.contains(Reg))
+ return printAsmMRegister(MO, 'x', O);
+
+ // If this is a b, h, s, d, or q register, print it as a v register.
+ return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
+ O);
+ }
+
+ printOperand(MI, OpNum, O);
+ return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNum,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isReg() && "unexpected inline asm memory operand");
+ O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
+ return false;
+}
+
+void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+ raw_ostream &OS) {
+ unsigned NOps = MI->getNumOperands();
+ assert(NOps == 4);
+ OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+ // cast away const; DIetc do not take const operands for some reason.
+ DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
+ OS << V.getName();
+ OS << " <- ";
+ // Frame address. Currently handles register +- offset only.
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+ OS << '[';
+ printOperand(MI, 0, OS);
+ OS << '+';
+ printOperand(MI, 1, OS);
+ OS << ']';
+ OS << "+";
+ printOperand(MI, NOps - 2, OS);
+}
+
+void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI) {
+ unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+ SM.recordStackMap(MI);
+ // Emit padding.
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+ for (unsigned i = 0; i < NumNOPBytes; i += 4)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI) {
+ SM.recordPatchPoint(MI);
+
+ PatchPointOpers Opers(&MI);
+
+ int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+ unsigned EncodedBytes = 0;
+ if (CallTarget) {
+ assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+ "High 16 bits of call target should be zero.");
+ unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ EncodedBytes = 16;
+ // Materialize the jump address:
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 32) & 0xFFFF)
+ .addImm(32));
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 16) & 0xFFFF)
+ .addImm(16));
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(CallTarget & 0xFFFF)
+ .addImm(0));
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
+ }
+ // Emit padding.
+ unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+ assert((NumBytes - EncodedBytes) % 4 == 0 &&
+ "Invalid number of NOP bytes requested!");
+ for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "AArch64GenMCPseudoLowering.inc"
+
+void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ // Do any auto-generated pseudo lowerings.
+ if (emitPseudoExpansionLowering(OutStreamer, MI))
+ return;
+
+ if (AArch64FI->getLOHRelated().count(MI)) {
+ // Generate a label for LOH related instruction
+ MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
+ // Associate the instruction with the label
+ LOHInstToLabel[MI] = LOHLabel;
+ OutStreamer.EmitLabel(LOHLabel);
+ }
+
+ // Do any manual lowerings.
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::DBG_VALUE: {
+ if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+ SmallString<128> TmpStr;
+ raw_svector_ostream OS(TmpStr);
+ PrintDebugValueComment(MI, OS);
+ OutStreamer.EmitRawText(StringRef(OS.str()));
+ }
+ return;
+ }
+
+ // Tail calls use pseudo instructions so they have the proper code-gen
+ // attributes (isCall, isReturn, etc.). We lower them to the real
+ // instruction here.
+ case AArch64::TCRETURNri: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(AArch64::BR);
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+ case AArch64::TCRETURNdi: {
+ MCOperand Dest;
+ MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+ MCInst TmpInst;
+ TmpInst.setOpcode(AArch64::B);
+ TmpInst.addOperand(Dest);
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+ case AArch64::TLSDESC_BLR: {
+ MCOperand Callee, Sym;
+ MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
+ MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
+
+ // First emit a relocation-annotation. This expands to no code, but requests
+ // the following instruction gets an R_AARCH64_TLSDESC_CALL.
+ MCInst TLSDescCall;
+ TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
+ TLSDescCall.addOperand(Sym);
+ EmitToStreamer(OutStreamer, TLSDescCall);
+
+ // Other than that it's just a normal indirect call to the function loaded
+ // from the descriptor.
+ MCInst BLR;
+ BLR.setOpcode(AArch64::BLR);
+ BLR.addOperand(Callee);
+ EmitToStreamer(OutStreamer, BLR);
+
+ return;
+ }
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(OutStreamer, SM, *MI);
+ }
+
+ // Finally, do the automated lowerings for everything else.
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmPrinter() {
- RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
- RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
-}
+ RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
+ RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
+ RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
+ RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h
deleted file mode 100644
index 824f003..0000000
--- a/lib/Target/AArch64/AArch64AsmPrinter.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// AArch64AsmPrinter.h - Print machine code to an AArch64 .s file -*- C++ -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the AArch64 assembly printer class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_AARCH64ASMPRINTER_H
-#define LLVM_AARCH64ASMPRINTER_H
-
-#include "AArch64.h"
-#include "AArch64TargetMachine.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-
-class MCOperand;
-
-class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter {
-
- /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
- /// make the right decision when printing asm code for different targets.
- const AArch64Subtarget *Subtarget;
-
- // emitPseudoExpansionLowering - tblgen'erated.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
-
- public:
- explicit AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer) {
- Subtarget = &TM.getSubtarget<AArch64Subtarget>();
- }
-
- bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
-
- MCOperand lowerSymbolOperand(const MachineOperand &MO,
- const MCSymbol *Sym) const;
-
- void EmitInstruction(const MachineInstr *MI);
- void EmitEndOfAsmFile(Module &M);
-
- bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O);
- bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O);
-
- /// printSymbolicAddress - Given some kind of reasonably bare symbolic
- /// reference, print out the appropriate asm string to represent it. If
- /// appropriate, a relocation-specifier will be produced, composed of a
- /// general class derived from the MO parameter and an instruction-specific
- /// suffix, provided in Suffix. E.g. ":got_lo12:" if a Suffix of "lo12" is
- /// given.
- bool printSymbolicAddress(const MachineOperand &MO,
- bool PrintImmediatePrefix,
- StringRef Suffix, raw_ostream &O);
-
- virtual const char *getPassName() const {
- return "AArch64 Assembly Printer";
- }
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-};
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
deleted file mode 100644
index c03cdde..0000000
--- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp
+++ /dev/null
@@ -1,600 +0,0 @@
-//===-- AArch64BranchFixupPass.cpp - AArch64 branch fixup -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass that fixes AArch64 branches which have ended up out
-// of range for their immediate operands.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "aarch64-branch-fixup"
-#include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumSplit, "Number of uncond branches inserted");
-STATISTIC(NumCBrFixed, "Number of cond branches fixed");
-
-/// Return the worst case padding that could result from unknown offset bits.
-/// This does not include alignment padding caused by known offset bits.
-///
-/// @param LogAlign log2(alignment)
-/// @param KnownBits Number of known low offset bits.
-static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
- if (KnownBits < LogAlign)
- return (1u << LogAlign) - (1u << KnownBits);
- return 0;
-}
-
-namespace {
- /// Due to limited PC-relative displacements, conditional branches to distant
- /// blocks may need converting into an unconditional equivalent. For example:
- /// tbz w1, #0, far_away
- /// becomes
- /// tbnz w1, #0, skip
- /// b far_away
- /// skip:
- class AArch64BranchFixup : public MachineFunctionPass {
- /// Information about the offset and size of a single basic block.
- struct BasicBlockInfo {
- /// Distance from the beginning of the function to the beginning of this
- /// basic block.
- ///
- /// Offsets are computed assuming worst case padding before an aligned
- /// block. This means that subtracting basic block offsets always gives a
- /// conservative estimate of the real distance which may be smaller.
- ///
- /// Because worst case padding is used, the computed offset of an aligned
- /// block may not actually be aligned.
- unsigned Offset;
-
- /// Size of the basic block in bytes. If the block contains inline
- /// assembly, this is a worst case estimate.
- ///
- /// The size does not include any alignment padding whether from the
- /// beginning of the block, or from an aligned jump table at the end.
- unsigned Size;
-
- /// The number of low bits in Offset that are known to be exact. The
- /// remaining bits of Offset are an upper bound.
- uint8_t KnownBits;
-
- /// When non-zero, the block contains instructions (inline asm) of unknown
- /// size. The real size may be smaller than Size bytes by a multiple of 1
- /// << Unalign.
- uint8_t Unalign;
-
- BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0) {}
-
- /// Compute the number of known offset bits internally to this block.
- /// This number should be used to predict worst case padding when
- /// splitting the block.
- unsigned internalKnownBits() const {
- unsigned Bits = Unalign ? Unalign : KnownBits;
- // If the block size isn't a multiple of the known bits, assume the
- // worst case padding.
- if (Size & ((1u << Bits) - 1))
- Bits = countTrailingZeros(Size);
- return Bits;
- }
-
- /// Compute the offset immediately following this block. If LogAlign is
- /// specified, return the offset the successor block will get if it has
- /// this alignment.
- unsigned postOffset(unsigned LogAlign = 0) const {
- unsigned PO = Offset + Size;
- if (!LogAlign)
- return PO;
- // Add alignment padding from the terminator.
- return PO + UnknownPadding(LogAlign, internalKnownBits());
- }
-
- /// Compute the number of known low bits of postOffset. If this block
- /// contains inline asm, the number of known bits drops to the
- /// instruction alignment. An aligned terminator may increase the number
- /// of know bits.
- /// If LogAlign is given, also consider the alignment of the next block.
- unsigned postKnownBits(unsigned LogAlign = 0) const {
- return std::max(LogAlign, internalKnownBits());
- }
- };
-
- std::vector<BasicBlockInfo> BBInfo;
-
- /// One per immediate branch, keeping the machine instruction pointer,
- /// conditional or unconditional, the max displacement, and (if IsCond is
- /// true) the corresponding inverted branch opcode.
- struct ImmBranch {
- MachineInstr *MI;
- unsigned OffsetBits : 31;
- bool IsCond : 1;
- ImmBranch(MachineInstr *mi, unsigned offsetbits, bool cond)
- : MI(mi), OffsetBits(offsetbits), IsCond(cond) {}
- };
-
- /// Keep track of all the immediate branch instructions.
- ///
- std::vector<ImmBranch> ImmBranches;
-
- MachineFunction *MF;
- const AArch64InstrInfo *TII;
- public:
- static char ID;
- AArch64BranchFixup() : MachineFunctionPass(ID) {}
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- virtual const char *getPassName() const {
- return "AArch64 branch fixup pass";
- }
-
- private:
- void initializeFunctionInfo();
- MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
- void adjustBBOffsetsAfter(MachineBasicBlock *BB);
- bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned OffsetBits);
- bool fixupImmediateBr(ImmBranch &Br);
- bool fixupConditionalBr(ImmBranch &Br);
-
- void computeBlockSize(MachineBasicBlock *MBB);
- unsigned getOffsetOf(MachineInstr *MI) const;
- void dumpBBs();
- void verify();
- };
- char AArch64BranchFixup::ID = 0;
-}
-
-/// check BBOffsets
-void AArch64BranchFixup::verify() {
-#ifndef NDEBUG
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock *MBB = MBBI;
- unsigned MBBId = MBB->getNumber();
- assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
- }
-#endif
-}
-
-/// print block size and offset information - debugging
-void AArch64BranchFixup::dumpBBs() {
- DEBUG({
- for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
- const BasicBlockInfo &BBI = BBInfo[J];
- dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
- << " kb=" << unsigned(BBI.KnownBits)
- << " ua=" << unsigned(BBI.Unalign)
- << format(" size=%#x\n", BBInfo[J].Size);
- }
- });
-}
-
-/// Returns an instance of the branch fixup pass.
-FunctionPass *llvm::createAArch64BranchFixupPass() {
- return new AArch64BranchFixup();
-}
-
-bool AArch64BranchFixup::runOnMachineFunction(MachineFunction &mf) {
- MF = &mf;
- DEBUG(dbgs() << "***** AArch64BranchFixup ******");
- TII = (const AArch64InstrInfo*)MF->getTarget().getInstrInfo();
-
- // This pass invalidates liveness information when it splits basic blocks.
- MF->getRegInfo().invalidateLiveness();
-
- // Renumber all of the machine basic blocks in the function, guaranteeing that
- // the numbers agree with the position of the block in the function.
- MF->RenumberBlocks();
-
- // Do the initial scan of the function, building up information about the
- // sizes of each block and location of each immediate branch.
- initializeFunctionInfo();
-
- // Iteratively fix up branches until there is no change.
- unsigned NoBRIters = 0;
- bool MadeChange = false;
- while (true) {
- DEBUG(dbgs() << "Beginning iteration #" << NoBRIters << '\n');
- bool BRChange = false;
- for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
- BRChange |= fixupImmediateBr(ImmBranches[i]);
- if (BRChange && ++NoBRIters > 30)
- report_fatal_error("Branch Fix Up pass failed to converge!");
- DEBUG(dumpBBs());
-
- if (!BRChange)
- break;
- MadeChange = true;
- }
-
- // After a while, this might be made debug-only, but it is not expensive.
- verify();
-
- DEBUG(dbgs() << '\n'; dumpBBs());
-
- BBInfo.clear();
- ImmBranches.clear();
-
- return MadeChange;
-}
-
-/// Return true if the specified basic block can fallthrough into the block
-/// immediately after it.
-static bool BBHasFallthrough(MachineBasicBlock *MBB) {
- // Get the next machine basic block in the function.
- MachineFunction::iterator MBBI = MBB;
- // Can't fall off end of function.
- if (std::next(MBBI) == MBB->getParent()->end())
- return false;
-
- MachineBasicBlock *NextBB = std::next(MBBI);
- for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
- E = MBB->succ_end(); I != E; ++I)
- if (*I == NextBB)
- return true;
-
- return false;
-}
-
-/// Do the initial scan of the function, building up information about the sizes
-/// of each block, and each immediate branch.
-void AArch64BranchFixup::initializeFunctionInfo() {
- BBInfo.clear();
- BBInfo.resize(MF->getNumBlockIDs());
-
- // First thing, compute the size of all basic blocks, and see if the function
- // has any inline assembly in it. If so, we have to be conservative about
- // alignment assumptions, as we don't know for sure the size of any
- // instructions in the inline assembly.
- for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
- computeBlockSize(I);
-
- // The known bits of the entry block offset are determined by the function
- // alignment.
- BBInfo.front().KnownBits = MF->getAlignment();
-
- // Compute block offsets and known bits.
- adjustBBOffsetsAfter(MF->begin());
-
- // Now go back through the instructions and build up our data structures.
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (I->isDebugValue())
- continue;
-
- int Opc = I->getOpcode();
- if (I->isBranch()) {
- bool IsCond = false;
-
- // The offsets encoded in instructions here scale by the instruction
- // size (4 bytes), effectively increasing their range by 2 bits.
- unsigned Bits = 0;
- switch (Opc) {
- default:
- continue; // Ignore other JT branches
- case AArch64::TBZxii:
- case AArch64::TBZwii:
- case AArch64::TBNZxii:
- case AArch64::TBNZwii:
- IsCond = true;
- Bits = 14 + 2;
- break;
- case AArch64::Bcc:
- case AArch64::CBZx:
- case AArch64::CBZw:
- case AArch64::CBNZx:
- case AArch64::CBNZw:
- IsCond = true;
- Bits = 19 + 2;
- break;
- case AArch64::Bimm:
- Bits = 26 + 2;
- break;
- }
-
- // Record this immediate branch.
- ImmBranches.push_back(ImmBranch(I, Bits, IsCond));
- }
- }
- }
-}
-
-/// Compute the size and some alignment information for MBB. This function
-/// updates BBInfo directly.
-void AArch64BranchFixup::computeBlockSize(MachineBasicBlock *MBB) {
- BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
- BBI.Size = 0;
- BBI.Unalign = 0;
-
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
- ++I) {
- BBI.Size += TII->getInstSizeInBytes(*I);
- // For inline asm, GetInstSizeInBytes returns a conservative estimate.
- // The actual size may be smaller, but still a multiple of the instr size.
- if (I->isInlineAsm())
- BBI.Unalign = 2;
- }
-}
-
-/// Return the current offset of the specified machine instruction from the
-/// start of the function. This offset changes as stuff is moved around inside
-/// the function.
-unsigned AArch64BranchFixup::getOffsetOf(MachineInstr *MI) const {
- MachineBasicBlock *MBB = MI->getParent();
-
- // The offset is composed of two things: the sum of the sizes of all MBB's
- // before this instruction's block, and the offset from the start of the block
- // it is in.
- unsigned Offset = BBInfo[MBB->getNumber()].Offset;
-
- // Sum instructions before MI in MBB.
- for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
- assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->getInstSizeInBytes(*I);
- }
- return Offset;
-}
-
-/// Split the basic block containing MI into two blocks, which are joined by
-/// an unconditional branch. Update data structures and renumber blocks to
-/// account for this change and returns the newly created block.
-MachineBasicBlock *
-AArch64BranchFixup::splitBlockBeforeInstr(MachineInstr *MI) {
- MachineBasicBlock *OrigBB = MI->getParent();
-
- // Create a new MBB for the code after the OrigBB.
- MachineBasicBlock *NewBB =
- MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
- MachineFunction::iterator MBBI = OrigBB; ++MBBI;
- MF->insert(MBBI, NewBB);
-
- // Splice the instructions starting with MI over to NewBB.
- NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
-
- // Add an unconditional branch from OrigBB to NewBB.
- // Note the new unconditional branch is not being recorded.
- // There doesn't seem to be meaningful DebugInfo available; this doesn't
- // correspond to anything in the source.
- BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::Bimm)).addMBB(NewBB);
- ++NumSplit;
-
- // Update the CFG. All succs of OrigBB are now succs of NewBB.
- NewBB->transferSuccessors(OrigBB);
-
- // OrigBB branches to NewBB.
- OrigBB->addSuccessor(NewBB);
-
- // Update internal data structures to account for the newly inserted MBB.
- MF->RenumberBlocks(NewBB);
-
- // Insert an entry into BBInfo to align it properly with the (newly
- // renumbered) block numbers.
- BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
-
- // Figure out how large the OrigBB is. As the first half of the original
- // block, it cannot contain a tablejump. The size includes
- // the new jump we added. (It should be possible to do this without
- // recounting everything, but it's very confusing, and this is rarely
- // executed.)
- computeBlockSize(OrigBB);
-
- // Figure out how large the NewMBB is. As the second half of the original
- // block, it may contain a tablejump.
- computeBlockSize(NewBB);
-
- // All BBOffsets following these blocks must be modified.
- adjustBBOffsetsAfter(OrigBB);
-
- return NewBB;
-}
-
-void AArch64BranchFixup::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
- unsigned BBNum = BB->getNumber();
- for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
- // Get the offset and known bits at the end of the layout predecessor.
- // Include the alignment of the current block.
- unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
- unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
- unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
-
- // This is where block i begins. Stop if the offset is already correct,
- // and we have updated 2 blocks. This is the maximum number of blocks
- // changed before calling this function.
- if (i > BBNum + 2 &&
- BBInfo[i].Offset == Offset &&
- BBInfo[i].KnownBits == KnownBits)
- break;
-
- BBInfo[i].Offset = Offset;
- BBInfo[i].KnownBits = KnownBits;
- }
-}
-
-/// Returns true if the distance between specific MI and specific BB can fit in
-/// MI's displacement field.
-bool AArch64BranchFixup::isBBInRange(MachineInstr *MI,
- MachineBasicBlock *DestBB,
- unsigned OffsetBits) {
- int64_t BrOffset = getOffsetOf(MI);
- int64_t DestOffset = BBInfo[DestBB->getNumber()].Offset;
-
- DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
- << " from BB#" << MI->getParent()->getNumber()
- << " bits available=" << OffsetBits
- << " from " << getOffsetOf(MI) << " to " << DestOffset
- << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
-
- return isIntN(OffsetBits, DestOffset - BrOffset);
-}
-
-/// Fix up an immediate branch whose destination is too far away to fit in its
-/// displacement field.
-bool AArch64BranchFixup::fixupImmediateBr(ImmBranch &Br) {
- MachineInstr *MI = Br.MI;
- MachineBasicBlock *DestBB = 0;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (MI->getOperand(i).isMBB()) {
- DestBB = MI->getOperand(i).getMBB();
- break;
- }
- }
- assert(DestBB && "Branch with no destination BB?");
-
- // Check to see if the DestBB is already in-range.
- if (isBBInRange(MI, DestBB, Br.OffsetBits))
- return false;
-
- assert(Br.IsCond && "Only conditional branches should need fixup");
- return fixupConditionalBr(Br);
-}
-
-/// Fix up a conditional branch whose destination is too far away to fit in its
-/// displacement field. It is converted to an inverse conditional branch + an
-/// unconditional branch to the destination.
-bool
-AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) {
- MachineInstr *MI = Br.MI;
- MachineBasicBlock *MBB = MI->getParent();
- unsigned CondBrMBBOperand = 0;
-
- // The general idea is to add an unconditional branch to the destination and
- // invert the conditional branch to jump over it. Complications occur around
- // fallthrough and unreachable ends to the block.
- // b.lt L1
- // =>
- // b.ge L2
- // b L1
- // L2:
-
- // First we invert the conditional branch, by creating a replacement if
- // necessary. This if statement contains all the special handling of different
- // branch types.
- if (MI->getOpcode() == AArch64::Bcc) {
- // The basic block is operand number 1 for Bcc
- CondBrMBBOperand = 1;
-
- A64CC::CondCodes CC = (A64CC::CondCodes)MI->getOperand(0).getImm();
- CC = A64InvertCondCode(CC);
- MI->getOperand(0).setImm(CC);
- } else {
- MachineInstrBuilder InvertedMI;
- int InvertedOpcode;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("Unknown branch type");
- case AArch64::TBZxii: InvertedOpcode = AArch64::TBNZxii; break;
- case AArch64::TBZwii: InvertedOpcode = AArch64::TBNZwii; break;
- case AArch64::TBNZxii: InvertedOpcode = AArch64::TBZxii; break;
- case AArch64::TBNZwii: InvertedOpcode = AArch64::TBZwii; break;
- case AArch64::CBZx: InvertedOpcode = AArch64::CBNZx; break;
- case AArch64::CBZw: InvertedOpcode = AArch64::CBNZw; break;
- case AArch64::CBNZx: InvertedOpcode = AArch64::CBZx; break;
- case AArch64::CBNZw: InvertedOpcode = AArch64::CBZw; break;
- }
-
- InvertedMI = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(InvertedOpcode));
- for (unsigned i = 0, e= MI->getNumOperands(); i != e; ++i) {
- InvertedMI.addOperand(MI->getOperand(i));
- if (MI->getOperand(i).isMBB())
- CondBrMBBOperand = i;
- }
-
- MI->eraseFromParent();
- MI = Br.MI = InvertedMI;
- }
-
- // If the branch is at the end of its MBB and that has a fall-through block,
- // direct the updated conditional branch to the fall-through
- // block. Otherwise, split the MBB before the next instruction.
- MachineInstr *BMI = &MBB->back();
- bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
-
- ++NumCBrFixed;
- if (BMI != MI) {
- if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
- BMI->getOpcode() == AArch64::Bimm) {
- // Last MI in the BB is an unconditional branch. We can swap destinations:
- // b.eq L1 (temporarily b.ne L1 after first change)
- // b L2
- // =>
- // b.ne L2
- // b L1
- MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
- if (isBBInRange(MI, NewDest, Br.OffsetBits)) {
- DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
- << *BMI);
- MachineBasicBlock *DestBB = MI->getOperand(CondBrMBBOperand).getMBB();
- BMI->getOperand(0).setMBB(DestBB);
- MI->getOperand(CondBrMBBOperand).setMBB(NewDest);
- return true;
- }
- }
- }
-
- if (NeedSplit) {
- MachineBasicBlock::iterator MBBI = MI; ++MBBI;
- splitBlockBeforeInstr(MBBI);
- // No need for the branch to the next block. We're adding an unconditional
- // branch to the destination.
- int delta = TII->getInstSizeInBytes(MBB->back());
- BBInfo[MBB->getNumber()].Size -= delta;
- MBB->back().eraseFromParent();
- // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
- }
-
- // After splitting and removing the unconditional branch from the original BB,
- // the structure is now:
- // oldbb:
- // [things]
- // b.invertedCC L1
- // splitbb/fallthroughbb:
- // [old b L2/real continuation]
- //
- // We now have to change the conditional branch to point to splitbb and add an
- // unconditional branch after it to L1, giving the final structure:
- // oldbb:
- // [things]
- // b.invertedCC splitbb
- // b L1
- // splitbb/fallthroughbb:
- // [old b L2/real continuation]
- MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
-
- DEBUG(dbgs() << " Insert B to BB#"
- << MI->getOperand(CondBrMBBOperand).getMBB()->getNumber()
- << " also invert condition and change dest. to BB#"
- << NextBB->getNumber() << "\n");
-
- // Insert a new unconditional branch and fixup the destination of the
- // conditional one. Also update the ImmBranch as well as adding a new entry
- // for the new branch.
- BuildMI(MBB, DebugLoc(), TII->get(AArch64::Bimm))
- .addMBB(MI->getOperand(CondBrMBBOperand).getMBB());
- MI->getOperand(CondBrMBBOperand).setMBB(NextBB);
-
- BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
-
- // 26 bits written down in Bimm, specifying a multiple of 4.
- unsigned OffsetBits = 26 + 2;
- ImmBranches.push_back(ImmBranch(&MBB->back(), OffsetBits, false));
-
- adjustBBOffsetsAfter(MBB);
- return true;
-}
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
new file mode 100644
index 0000000..5209452
--- /dev/null
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -0,0 +1,510 @@
+//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-branch-relax"
+
+static cl::opt<bool>
+BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true),
+ cl::desc("Relax out of range conditional branches"));
+
+static cl::opt<unsigned>
+TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
+ cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
+ cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
+ cl::desc("Restrict range of Bcc instructions (DEBUG)"));
+
+STATISTIC(NumSplit, "Number of basic blocks split");
+STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
+
+namespace {
+class AArch64BranchRelaxation : public MachineFunctionPass {
+ /// BasicBlockInfo - Information about the offset and size of a single
+ /// basic block.
+ struct BasicBlockInfo {
+ /// Offset - Distance from the beginning of the function to the beginning
+ /// of this basic block.
+ ///
+ /// The offset is always aligned as required by the basic block.
+ unsigned Offset;
+
+ /// Size - Size of the basic block in bytes. If the block contains
+ /// inline assembly, this is a worst case estimate.
+ ///
+ /// The size does not include any alignment padding whether from the
+ /// beginning of the block, or from an aligned jump table at the end.
+ unsigned Size;
+
+ BasicBlockInfo() : Offset(0), Size(0) {}
+
+ /// Compute the offset immediately following this block. If LogAlign is
+ /// specified, return the offset the successor block will get if it has
+ /// this alignment.
+ unsigned postOffset(unsigned LogAlign = 0) const {
+ unsigned PO = Offset + Size;
+ unsigned Align = 1 << LogAlign;
+ return (PO + Align - 1) / Align * Align;
+ }
+ };
+
+ SmallVector<BasicBlockInfo, 16> BlockInfo;
+
+ MachineFunction *MF;
+ const AArch64InstrInfo *TII;
+
+ bool relaxBranchInstructions();
+ void scanFunction();
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+ void adjustBlockOffsets(MachineBasicBlock &MBB);
+ bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+ bool fixupConditionalBranch(MachineInstr *MI);
+ void computeBlockSize(const MachineBasicBlock &MBB);
+ unsigned getInstrOffset(MachineInstr *MI) const;
+ void dumpBBs();
+ void verify();
+
+public:
+ static char ID;
+ AArch64BranchRelaxation() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "AArch64 branch relaxation pass";
+ }
+};
+char AArch64BranchRelaxation::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void AArch64BranchRelaxation::verify() {
+#ifndef NDEBUG
+ unsigned PrevNum = MF->begin()->getNumber();
+ for (MachineBasicBlock &MBB : *MF) {
+ unsigned Align = MBB.getAlignment();
+ unsigned Num = MBB.getNumber();
+ assert(BlockInfo[Num].Offset % (1u << Align) == 0);
+ assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset);
+ PrevNum = Num;
+ }
+#endif
+}
+
+/// print block size and offset information - debugging
+void AArch64BranchRelaxation::dumpBBs() {
+ for (auto &MBB : *MF) {
+ const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
+ dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
+ << format("size=%#x\n", BBI.Size);
+ }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+ // Get the next machine basic block in the function.
+ MachineFunction::iterator MBBI = MBB;
+ // Can't fall off end of function.
+ MachineBasicBlock *NextBB = std::next(MBBI);
+ if (NextBB == MBB->getParent()->end())
+ return false;
+
+ for (MachineBasicBlock *S : MBB->successors())
+ if (S == NextBB)
+ return true;
+
+ return false;
+}
+
+/// scanFunction - Do the initial scan of the function, building up
+/// information about each block.
+void AArch64BranchRelaxation::scanFunction() {
+ BlockInfo.clear();
+ BlockInfo.resize(MF->getNumBlockIDs());
+
+ // First thing, compute the size of all basic blocks, and see if the function
+ // has any inline assembly in it. If so, we have to be conservative about
+ // alignment assumptions, as we don't know for sure the size of any
+ // instructions in the inline assembly.
+ for (MachineBasicBlock &MBB : *MF)
+ computeBlockSize(MBB);
+
+ // Compute block offsets and known bits.
+ adjustBlockOffsets(*MF->begin());
+}
+
+/// computeBlockSize - Compute the size for MBB.
+/// This function updates BlockInfo directly.
+void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) {
+ unsigned Size = 0;
+ for (const MachineInstr &MI : MBB)
+ Size += TII->GetInstSizeInBytes(&MI);
+ BlockInfo[MBB.getNumber()].Size = Size;
+}
+
+/// getInstrOffset - Return the current offset of the specified machine
+/// instruction from the start of the function. This offset changes as stuff is
+/// moved around inside the function.
+unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BlockInfo[MBB->getNumber()].Offset;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ Offset += TII->GetInstSizeInBytes(I);
+ }
+ return Offset;
+}
+
+void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
+ unsigned PrevNum = Start.getNumber();
+ for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) {
+ unsigned Num = MBB.getNumber();
+ if (!Num) // block zero is never changed from offset zero.
+ continue;
+ // Get the offset and known bits at the end of the layout predecessor.
+ // Include the alignment of the current block.
+ unsigned LogAlign = MBB.getAlignment();
+ BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign);
+ PrevNum = Num;
+ }
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+/// NOTE: Successor list of the original BB is out of date after this function,
+/// and must be updated by the caller! Other transforms follow using this
+/// utility function, so no point updating now rather than waiting.
+MachineBasicBlock *
+AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
+ MachineBasicBlock *OrigBB = MI->getParent();
+
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB =
+ MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+ MachineFunction::iterator MBBI = OrigBB;
+ ++MBBI;
+ MF->insert(MBBI, NewBB);
+
+ // Splice the instructions starting with MI over to NewBB.
+ NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+ // Add an unconditional branch from OrigBB to NewBB.
+ // Note the new unconditional branch is not being recorded.
+ // There doesn't seem to be meaningful DebugInfo available; this doesn't
+ // correspond to anything in the source.
+ BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB);
+
+ // Insert an entry into BlockInfo to align it properly with the block numbers.
+ BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+ // Figure out how large the OrigBB is. As the first half of the original
+ // block, it cannot contain a tablejump. The size includes
+ // the new jump we added. (It should be possible to do this without
+ // recounting everything, but it's very confusing, and this is rarely
+ // executed.)
+ computeBlockSize(*OrigBB);
+
+ // Figure out how large the NewMBB is. As the second half of the original
+ // block, it may contain a tablejump.
+ computeBlockSize(*NewBB);
+
+ // All BBOffsets following these blocks must be modified.
+ adjustBlockOffsets(*OrigBB);
+
+ ++NumSplit;
+
+ return NewBB;
+}
+
+/// isBlockInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI,
+ MachineBasicBlock *DestBB,
+ unsigned Bits) {
+ unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2;
+ unsigned BrOffset = getInstrOffset(MI);
+ unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset;
+
+ DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxOffs << " from " << getInstrOffset(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
+
+ // Branch before the Dest.
+ if (BrOffset <= DestOffset)
+ return (DestOffset - BrOffset <= MaxOffs);
+ return (BrOffset - DestOffset <= MaxOffs);
+}
+
+static bool isConditionalBranch(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::TBZW:
+ case AArch64::TBNZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZX:
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ case AArch64::Bcc:
+ return true;
+ }
+}
+
+static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ assert(0 && "unexpected opcode!");
+ case AArch64::TBZW:
+ case AArch64::TBNZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZX:
+ return MI->getOperand(2).getMBB();
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ case AArch64::Bcc:
+ return MI->getOperand(1).getMBB();
+ }
+}
+
+static unsigned getOppositeConditionOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ assert(0 && "unexpected opcode!");
+ case AArch64::TBNZW: return AArch64::TBZW;
+ case AArch64::TBNZX: return AArch64::TBZX;
+ case AArch64::TBZW: return AArch64::TBNZW;
+ case AArch64::TBZX: return AArch64::TBNZX;
+ case AArch64::CBNZW: return AArch64::CBZW;
+ case AArch64::CBNZX: return AArch64::CBZX;
+ case AArch64::CBZW: return AArch64::CBNZW;
+ case AArch64::CBZX: return AArch64::CBNZX;
+ case AArch64::Bcc: return AArch64::Bcc; // Condition is an operand for Bcc.
+ }
+}
+
+static unsigned getBranchDisplacementBits(unsigned Opc) {
+ switch (Opc) {
+ default:
+ assert(0 && "unexpected opcode!");
+ case AArch64::TBNZW:
+ case AArch64::TBZW:
+ case AArch64::TBNZX:
+ case AArch64::TBZX:
+ return TBZDisplacementBits;
+ case AArch64::CBNZW:
+ case AArch64::CBZW:
+ case AArch64::CBNZX:
+ case AArch64::CBZX:
+ return CBZDisplacementBits;
+ case AArch64::Bcc:
+ return BCCDisplacementBits;
+ }
+}
+
+static inline void invertBccCondition(MachineInstr *MI) {
+ assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!");
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm();
+ CC = AArch64CC::getInvertedCondCode(CC);
+ MI->getOperand(0).setImm((int64_t)CC);
+}
+
+/// fixupConditionalBranch - Fix up a conditional branch whose destination is
+/// too far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
+ MachineBasicBlock *DestBB = getDestBlock(MI);
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // tbz L1
+ // =>
+ // tbnz L2
+ // b L1
+ // L2:
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+ if (BMI != MI) {
+ if (std::next(MachineBasicBlock::iterator(MI)) ==
+ std::prev(MBB->getLastNonDebugInstr()) &&
+ BMI->getOpcode() == AArch64::B) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+ if (isBlockInRange(MI, NewDest,
+ getBranchDisplacementBits(MI->getOpcode()))) {
+ DEBUG(dbgs() << " Invert condition and swap its destination with "
+ << *BMI);
+ BMI->getOperand(0).setMBB(DestBB);
+ unsigned OpNum = (MI->getOpcode() == AArch64::TBZW ||
+ MI->getOpcode() == AArch64::TBNZW ||
+ MI->getOpcode() == AArch64::TBZX ||
+ MI->getOpcode() == AArch64::TBNZX)
+ ? 2
+ : 1;
+ MI->getOperand(OpNum).setMBB(NewDest);
+ MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode())));
+ if (MI->getOpcode() == AArch64::Bcc)
+ invertBccCondition(MI);
+ return true;
+ }
+ }
+ }
+
+ if (NeedSplit) {
+ // Analyze the branch so we know how to update the successor lists.
+ MachineBasicBlock *TBB, *FBB;
+ SmallVector<MachineOperand, 2> Cond;
+ TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
+
+ MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int delta = TII->GetInstSizeInBytes(&MBB->back());
+ BlockInfo[MBB->getNumber()].Size -= delta;
+ MBB->back().eraseFromParent();
+ // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
+
+ // Update the successor lists according to the transformation to follow.
+ // Do it here since if there's no split, no update is needed.
+ MBB->replaceSuccessor(FBB, NewBB);
+ NewBB->addSuccessor(FBB);
+ }
+ MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+
+ DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
+ << ", invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
+
+ // Insert a new conditional branch and a new unconditional branch.
+ MachineInstrBuilder MIB = BuildMI(
+ MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode())))
+ .addOperand(MI->getOperand(0));
+ if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW ||
+ MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX)
+ MIB.addOperand(MI->getOperand(1));
+ if (MI->getOpcode() == AArch64::Bcc)
+ invertBccCondition(MIB);
+ MIB.addMBB(NextBB);
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB);
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ MI->eraseFromParent();
+
+ // Finally, keep the block offsets up to date.
+ adjustBlockOffsets(*MBB);
+ return true;
+}
+
+bool AArch64BranchRelaxation::relaxBranchInstructions() {
+ bool Changed = false;
+ // Relaxing branches involves creating new basic blocks, so re-eval
+ // end() for termination.
+ for (auto &MBB : *MF) {
+ MachineInstr *MI = MBB.getFirstTerminator();
+ if (isConditionalBranch(MI->getOpcode()) &&
+ !isBlockInRange(MI, getDestBlock(MI),
+ getBranchDisplacementBits(MI->getOpcode()))) {
+ fixupConditionalBranch(MI);
+ ++NumRelaxed;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+
+ // If the pass is disabled, just bail early.
+ if (!BranchRelaxation)
+ return false;
+
+ DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
+
+ TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ MF->RenumberBlocks();
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block.
+ scanFunction();
+
+ DEBUG(dbgs() << " Basic blocks before relaxation\n");
+ DEBUG(dumpBBs());
+
+ bool MadeChange = false;
+ while (relaxBranchInstructions())
+ MadeChange = true;
+
+ // After a while, this might be made debug-only, but it is not expensive.
+ verify();
+
+ DEBUG(dbgs() << " Basic blocks after relaxation\n");
+ DEBUG(dbgs() << '\n'; dumpBBs());
+
+ BlockInfo.clear();
+
+ return MadeChange;
+}
+
+/// createAArch64BranchRelaxation - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createAArch64BranchRelaxation() {
+ return new AArch64BranchRelaxation();
+}
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
deleted file mode 100644
index 9fe6aae..0000000
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ /dev/null
@@ -1,197 +0,0 @@
-//==-- AArch64CallingConv.td - Calling Conventions for ARM ----*- tblgen -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This describes the calling conventions for AArch64 architecture.
-//===----------------------------------------------------------------------===//
-
-
-// The AArch64 Procedure Call Standard is unfortunately specified at a slightly
-// higher level of abstraction than LLVM's target interface presents. In
-// particular, it refers (like other ABIs, in fact) directly to
-// structs. However, generic LLVM code takes the liberty of lowering structure
-// arguments to the component fields before we see them.
-//
-// As a result, the obvious direct map from LLVM IR to PCS concepts can't be
-// implemented, so the goals of this calling convention are, in decreasing
-// priority order:
-// 1. Expose *some* way to express the concepts required to implement the
-// generic PCS from a front-end.
-// 2. Provide a sane ABI for pure LLVM.
-// 3. Follow the generic PCS as closely as is naturally possible.
-//
-// The suggested front-end implementation of PCS features is:
-// * Integer, float and vector arguments of all sizes which end up in
-// registers are passed and returned via the natural LLVM type.
-// * Structure arguments with size <= 16 bytes are passed and returned in
-// registers as similar integer or composite types. For example:
-// [1 x i64], [2 x i64] or [1 x i128] (if alignment 16 needed).
-// * HFAs in registers follow rules similar to small structs: appropriate
-// composite types.
-// * Structure arguments with size > 16 bytes are passed via a pointer,
-// handled completely by the front-end.
-// * Structure return values > 16 bytes via an sret pointer argument.
-// * Other stack-based arguments (not large structs) are passed using byval
-// pointers. Padding arguments are added beforehand to guarantee a large
-// struct doesn't later use integer registers.
-//
-// N.b. this means that it is the front-end's responsibility (if it cares about
-// PCS compliance) to check whether enough registers are available for an
-// argument when deciding how to pass it.
-
-class CCIfAlign<int Align, CCAction A>:
- CCIf<"ArgFlags.getOrigAlign() == " # Align, A>;
-
-def CC_A64_APCS : CallingConv<[
- // SRet is an LLVM-specific concept, so it takes precedence over general ABI
- // concerns. However, this rule will be used by C/C++ frontends to implement
- // structure return.
- CCIfSRet<CCAssignToReg<[X8]>>,
-
- // Put ByVal arguments directly on the stack. Minimum size and alignment of a
- // slot is 64-bit.
- CCIfByVal<CCPassByVal<8, 8>>,
-
- // Canonicalise the various types that live in different floating-point
- // registers. This makes sense because the PCS does not distinguish Short
- // Vectors and Floating-point types.
- CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>,
- CCIfType<[v1i32, v4i8, v2i16], CCBitConvertToType<f32>>,
- CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>,
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCBitConvertToType<f128>>,
-
- // PCS: "C.1: If the argument is a Half-, Single-, Double- or Quad- precision
- // Floating-point or Short Vector Type and the NSRN is less than 8, then the
- // argument is allocated to the least significant bits of register
- // v[NSRN]. The NSRN is incremented by one. The argument has now been
- // allocated."
- CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
- CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
- CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
- CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
- CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-
- // PCS: "C.2: If the argument is an HFA and there are sufficient unallocated
- // SIMD and Floating-point registers (NSRN - number of elements < 8), then the
- // argument is allocated to SIMD and Floating-point registers (with one
- // register per element of the HFA). The NSRN is incremented by the number of
- // registers used. The argument has now been allocated."
- //
- // N.b. As above, this rule is the responsibility of the front-end.
-
- // "C.3: If the argument is an HFA then the NSRN is set to 8 and the size of
- // the argument is rounded up to the nearest multiple of 8 bytes."
- //
- // "C.4: If the argument is an HFA, a Quad-precision Floating-point or Short
- // Vector Type then the NSAA is rounded up to the larger of 8 or the Natural
- // Alignment of the Argument's type."
- //
- // It is expected that these will be satisfied by adding dummy arguments to
- // the prototype.
-
- // PCS: "C.5: If the argument is a Half- or Single- precision Floating-point
- // type then the size of the argument is set to 8 bytes. The effect is as if
- // the argument had been copied to the least significant bits of a 64-bit
- // register and the remaining bits filled with unspecified values."
- CCIfType<[f16, f32], CCPromoteToType<f64>>,
-
- // PCS: "C.6: If the argument is an HFA, a Half-, Single-, Double- or Quad-
- // precision Floating-point or Short Vector Type, then the argument is copied
- // to memory at the adjusted NSAA. The NSAA is incremented by the size of the
- // argument. The argument has now been allocated."
- CCIfType<[f64], CCAssignToStack<8, 8>>,
- CCIfType<[f128], CCAssignToStack<16, 16>>,
-
- // PCS: "C.7: If the argument is an Integral Type, the size of the argument is
- // less than or equal to 8 bytes and the NGRN is less than 8, the argument is
- // copied to the least significant bits of x[NGRN]. The NGRN is incremented by
- // one. The argument has now been allocated."
-
- // First we implement C.8 and C.9 (128-bit types get even registers). i128 is
- // represented as two i64s, the first one being split. If we delayed this
- // operation C.8 would never be reached.
- CCIfType<[i64],
- CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6], [X0, X1, X3, X5]>>>,
-
- // Note: the promotion also implements C.14.
- CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
-
- // And now the real implementation of C.7
- CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
-
- // PCS: "C.8: If the argument has an alignment of 16 then the NGRN is rounded
- // up to the next even number."
- //
- // "C.9: If the argument is an Integral Type, the size of the argument is
- // equal to 16 and the NGRN is less than 7, the argument is copied to x[NGRN]
- // and x[NGRN+1], x[NGRN] shall contain the lower addressed double-word of the
- // memory representation of the argument. The NGRN is incremented by two. The
- // argument has now been allocated."
- //
- // Subtlety here: what if alignment is 16 but it is not an integral type? All
- // floating-point types have been allocated already, which leaves composite
- // types: this is why a front-end may need to produce i128 for a struct <= 16
- // bytes.
-
- // PCS: "C.10 If the argument is a Composite Type and the size in double-words
- // of the argument is not more than 8 minus NGRN, then the argument is copied
- // into consecutive general-purpose registers, starting at x[NGRN]. The
- // argument is passed as though it had been loaded into the registers from a
- // double-word aligned address with an appropriate sequence of LDR
- // instructions loading consecutive registers from memory (the contents of any
- // unused parts of the registers are unspecified by this standard). The NGRN
- // is incremented by the number of registers used. The argument has now been
- // allocated."
- //
- // Another one that's the responsibility of the front-end (sigh).
-
- // PCS: "C.11: The NGRN is set to 8."
- CCCustom<"CC_AArch64NoMoreRegs">,
-
- // PCS: "C.12: The NSAA is rounded up to the larger of 8 or the Natural
- // Alignment of the argument's type."
- //
- // PCS: "C.13: If the argument is a composite type then the argument is copied
- // to memory at the adjusted NSAA. The NSAA is by the size of the
- // argument. The argument has now been allocated."
- //
- // Note that the effect of this corresponds to a memcpy rather than register
- // stores so that the struct ends up correctly addressable at the adjusted
- // NSAA.
-
- // PCS: "C.14: If the size of the argument is less than 8 bytes then the size
- // of the argument is set to 8 bytes. The effect is as if the argument was
- // copied to the least significant bits of a 64-bit register and the remaining
- // bits filled with unspecified values."
- //
- // Integer types were widened above. Floating-point and composite types have
- // already been allocated completely. Nothing to do.
-
- // PCS: "C.15: The argument is copied to memory at the adjusted NSAA. The NSAA
- // is incremented by the size of the argument. The argument has now been
- // allocated."
- CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64], CCAssignToStack<8, 8>>
-
-]>;
-
-// According to the PCS, X19-X30 are callee-saved, however only the low 64-bits
-// of vector registers (8-15) are callee-saved. The order here is is picked up
-// by PrologEpilogInserter.cpp to allocate stack slots, starting from top of
-// stack upon entry. This gives the customary layout of x30 at [sp-8], x29 at
-// [sp-16], ...
-def CSR_PCS : CalleeSavedRegs<(add (sequence "X%u", 30, 19),
- (sequence "D%u", 15, 8))>;
-
-
-// TLS descriptor calls are extremely restricted in their changes, to allow
-// optimisations in the (hopefully) more common fast path where no real action
-// is needed. They actually have to preserve all registers, except for the
-// unavoidable X30 and the return register X0.
-def TLSDesc : CalleeSavedRegs<(add (sequence "X%u", 29, 1),
- (sequence "Q%u", 31, 0))>;
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
new file mode 100644
index 0000000..ded2e17
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -0,0 +1,240 @@
+//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for AArch64 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A> :
+ CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+/// CCIfBigEndian - Match only if we're in big endian mode.
+class CCIfBigEndian<CCAction A> :
+ CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+
+class CCIfUnallocated<string Reg, CCAction A> :
+ CCIf<"!State.isAllocated(AArch64::" # Reg # ")", A>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_AArch64_AAPCS : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+ // Big endian vectors must be passed as if they were 1-element vectors so that
+ // their lanes are in a consistent order.
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCBitConvertToType<f64>>>,
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCBitConvertToType<f128>>>,
+
+ // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+ // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+ // slot is 64-bit.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+ // up to eight each of GPR and FPR.
+ CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ // i128 is split to two i64s, we can't fit half to register X7.
+ CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
+ [X0, X1, X3, X5]>>>,
+
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ // If more than will fit in registers, pass them on the stack instead.
+ CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>,
+ CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToStack<16, 16>>
+]>;
+
+def RetCC_AArch64_AAPCS : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+ // Big endian vectors must be passed as if they were 1-element vectors so that
+ // their lanes are in a consistent order.
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCBitConvertToType<f64>>>,
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCBitConvertToType<f128>>>,
+
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+
+// Darwin uses a calling convention which differs in only two ways
+// from the standard one at this level:
+// + i128s (i.e. split i64s) don't need even registers.
+// + Stack slots are sized as needed rather than being at least 64-bit.
+def CC_AArch64_DarwinPCS : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+ // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+ // slot is 64-bit.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+ // up to eight each of GPR and FPR.
+ CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ // i128 is split to two i64s, we can't fit half to register X7.
+ CCIfType<[i64],
+ CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
+ [W0, W1, W2, W3, W4, W5, W6]>>>,
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ // If more than will fit in registers, pass them on the stack instead.
+ CCIfType<[i1, i8], CCAssignToStack<1, 1>>,
+ CCIfType<[i16], CCAssignToStack<2, 2>>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+]>;
+
+def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // Handle all scalar types as either i64 or f64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+ CCIfType<[f32], CCPromoteToType<f64>>,
+
+ // Everything is on the stack.
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+]>;
+
+// The WebKit_JS calling convention only passes the first argument (the callee)
+// in register and the remaining arguments on stack. We allow 32bit stack slots,
+// so that WebKit can write partial values in the stack and define the other
+// 32bit quantity as undef.
+def CC_AArch64_WebKit_JS : CallingConv<[
+ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
+ CCIfType<[i1, i8, i16], CCIfUnallocated<"X0", CCPromoteToType<i32>>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
+
+ // Pass the remaining arguments on the stack instead.
+ CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_AArch64_WebKit_JS : CallingConv<[
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
+// presumably a callee to someone. External functions may not do so, but this
+// is currently safe since BL has LR as an implicit-def and what happens after a
+// tail call doesn't matter.
+//
+// It would be better to model its preservation semantics properly (create a
+// vreg on entry, use it in RET & tail call generation; make that vreg def if we
+// end up saving LR as part of a call frame). Watch this space...
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+ X23, X24, X25, X26, X27, X28,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>;
+
+// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
+// 'this' and the pointer return value are both passed in X0 in these cases,
+// this can be partially modelled by treating X0 as a callee-saved register;
+// only the resulting RegMask is used; the SaveList is ignored
+//
+// (For generic ARM 64-bit ABI code, clang will not generate constructors or
+// destructors with 'this' returns, so this RegMask will not be used in that
+// case)
+def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. x16 and x17 are used on the
+// fast path for calculation, but other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_AArch64_TLS_Darwin
+ : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+ FP,
+ (sequence "Q%u", 0, 31))>;
+
+// The ELF stub used for TLS-descriptor access saves every feasible
+// register. Only X0 and LR are clobbered.
+def CSR_AArch64_TLS_ELF
+ : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
+ (sequence "Q%u", 0, 31))>;
+
+def CSR_AArch64_AllRegs
+ : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
+ (sequence "X%u", 0, 28), FP, LR, SP,
+ (sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
+ (sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
+ (sequence "Q%u", 0, 31))>;
+
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
new file mode 100644
index 0000000..4d23dc5
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -0,0 +1,147 @@
+//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Local-dynamic access to thread-local variables proceeds in three stages.
+//
+// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated
+// in much the same way as a general-dynamic TLS-descriptor access against
+// the special symbol _TLS_MODULE_BASE.
+// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
+// instructions with "dtprel" modifiers.
+// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
+// true address.
+//
+// This is only better than general-dynamic access to the variable if two or
+// more of the first stage TLS-descriptor calculations can be combined. This
+// pass looks through a function and performs such combinations.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // No point folding accesses if there isn't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-null, then use that to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case AArch64::TLSDESC_BLR:
+ // Make sure it's a local dynamic access.
+ if (!I->getOperand(1).isSymbol() ||
+ strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+ break;
+
+ if (TLSBaseAddrReg)
+ I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ else
+ I = setRegister(I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode *N : *Node) {
+ Changed |= VisitNode(N, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const AArch64TargetMachine *TM =
+ static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+ const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+ // code sequence assumes the address will be.
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ AArch64::X0).addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I->eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtal register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const AArch64TargetMachine *TM =
+ static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+ const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
+
+ // Insert a copy from X0 to TLSBaseAddrReg for later.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ *TLSBaseAddrReg).addReg(AArch64::X0);
+
+ return Copy;
+ }
+
+ const char *getPassName() const override {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
+ return new LDTLSCleanup();
+}
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
new file mode 100644
index 0000000..6b1f096
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -0,0 +1,1117 @@
+//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that collect the Linker Optimization Hint (LOH).
+// This pass should be run at the very end of the compilation flow, just before
+// assembly printer.
+// To be useful for the linker, the LOH must be printed into the assembly file.
+//
+// A LOH describes a sequence of instructions that may be optimized by the
+// linker.
+// This same sequence cannot be optimized by the compiler because some of
+// the information will be known at link time.
+// For instance, consider the following sequence:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: ldr xC, [xB, #imm]
+// This sequence can be turned into:
+// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB:
+// L3: ldr xC, sym+#imm
+// It may also be turned into either the following more efficient
+// code sequences:
+// - If sym@PAGEOFF + #imm fits the encoding space of L3.
+// L1: adrp xA, sym@PAGE
+// L3: ldr xC, [xB, sym@PAGEOFF + #imm]
+// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB:
+// L1: adr xA, sym
+// L3: ldr xC, [xB, #imm]
+//
+// To be valid a LOH must meet all the requirements needed by all the related
+// possible linker transformations.
+// For instance, using the running example, the constraints to emit
+// ".loh AdrpAddLdr" are:
+// - L1, L2, and L3 instructions are of the expected type, i.e.,
+// respectively ADRP, ADD (immediate), and LD.
+// - The result of L1 is used only by L2.
+// - The register argument (xA) used in the ADD instruction is defined
+// only by L1.
+// - The result of L2 is used only by L3.
+// - The base address (xB) in L3 is defined only L2.
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+// @PAGE/@PAGEOFF with no additional constants
+//
+// Currently supported LOHs are:
+// * So called non-ADRP-related:
+// - .loh AdrpAddLdr L1, L2, L3:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: ldr xC, [xB, #imm]
+// - .loh AdrpLdrGotLdr L1, L2, L3:
+// L1: adrp xA, sym@GOTPAGE
+// L2: ldr xB, [xA, sym@GOTPAGEOFF]
+// L3: ldr xC, [xB, #imm]
+// - .loh AdrpLdr L1, L3:
+// L1: adrp xA, sym@PAGE
+// L3: ldr xC, [xA, sym@PAGEOFF]
+// - .loh AdrpAddStr L1, L2, L3:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: str xC, [xB, #imm]
+// - .loh AdrpLdrGotStr L1, L2, L3:
+// L1: adrp xA, sym@GOTPAGE
+// L2: ldr xB, [xA, sym@GOTPAGEOFF]
+// L3: str xC, [xB, #imm]
+// - .loh AdrpAdd L1, L2:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// For all these LOHs, L1, L2, L3 form a simple chain:
+// L1 result is used only by L2 and L2 result by L3.
+// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument
+// by L1.
+// All these LOHs aim at using more efficient load/store patterns by folding
+// some instructions used to compute the address directly into the load/store.
+//
+// * So called ADRP-related:
+// - .loh AdrpAdrp L2, L1:
+// L2: ADRP xA, sym1@PAGE
+// L1: ADRP xA, sym2@PAGE
+// L2 dominates L1 and xA is not redifined between L2 and L1
+// This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it:
+// 1. Associates them a label.
+// 2. Emits them in a MCStreamer (EmitLOHDirective).
+// - The MCMachOStreamer records them into the MCAssembler.
+// - The MCAsmStreamer prints them.
+// - Other MCStreamers ignore them.
+// 3. Closes the MCStreamer:
+// - The MachObjectWriter gets them from the MCAssembler and writes
+// them in the object file.
+// - Other ObjectWriters ignore them.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-collect-loh"
+
+static cl::opt<bool>
+PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
+ cl::desc("Restrict analysis to registers invovled"
+ " in LOHs"),
+ cl::init(true));
+
+static cl::opt<bool>
+BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
+ cl::desc("Restrict analysis at basic block scope"),
+ cl::init(true));
+
+STATISTIC(NumADRPSimpleCandidate,
+ "Number of simplifiable ADRP dominate by another");
+STATISTIC(NumADRPComplexCandidate2,
+ "Number of simplifiable ADRP reachable by 2 defs");
+STATISTIC(NumADRPComplexCandidate3,
+ "Number of simplifiable ADRP reachable by 3 defs");
+STATISTIC(NumADRPComplexCandidateOther,
+ "Number of simplifiable ADRP reachable by 4 or more defs");
+STATISTIC(NumADDToSTRWithImm,
+ "Number of simplifiable STR with imm reachable by ADD");
+STATISTIC(NumLDRToSTRWithImm,
+ "Number of simplifiable STR with imm reachable by LDR");
+STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
+STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
+STATISTIC(NumADDToLDRWithImm,
+ "Number of simplifiable LDR with imm reachable by ADD");
+STATISTIC(NumLDRToLDRWithImm,
+ "Number of simplifiable LDR with imm reachable by LDR");
+STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
+STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
+STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
+STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
+STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
+STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
+STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
+STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
+STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
+
+namespace llvm {
+void initializeAArch64CollectLOHPass(PassRegistry &);
+}
+
+namespace {
+struct AArch64CollectLOH : public MachineFunctionPass {
+ static char ID;
+ AArch64CollectLOH() : MachineFunctionPass(ID) {
+ initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "AArch64 Collect Linker Optimization Hint (LOH)";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ }
+
+private:
+};
+
+/// A set of MachineInstruction.
+typedef SetVector<const MachineInstr *> SetOfMachineInstr;
+/// Map a basic block to a set of instructions per register.
+/// This is used to represent the exposed uses of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+BlockToSetOfInstrsPerColor;
+/// Map a basic block to an instruction per register.
+/// This is used to represent the live-out definitions of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+BlockToInstrPerColor;
+/// Map an instruction to a set of instructions. Used to represent the
+/// mapping def to reachable uses or use to definitions.
+typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs;
+/// Map a basic block to a BitVector.
+/// This is used to record the kill registers per basic block.
+typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet;
+
+/// Map a register to a dense id.
+typedef DenseMap<unsigned, unsigned> MapRegToId;
+/// Map a dense id to a register. Used for debug purposes.
+typedef SmallVector<unsigned, 32> MapIdToReg;
+} // end anonymous namespace.
+
+char AArch64CollectLOH::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
+ "AArch64 Collect Linker Optimization Hint (LOH)", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
+ "AArch64 Collect Linker Optimization Hint (LOH)", false,
+ false)
+
+/// Given a couple (MBB, reg) get the corresponding set of instruction from
+/// the given "sets".
+/// If this couple does not reference any set, an empty set is added to "sets"
+/// for this couple and returned.
+/// \param nbRegs is used internally allocate some memory. It must be consistent
+/// with the way sets is used.
+static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
+ const MachineBasicBlock &MBB, unsigned reg,
+ unsigned nbRegs) {
+ SetOfMachineInstr *result;
+ BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
+ if (it != sets.end())
+ result = it->second;
+ else
+ result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
+
+ return result[reg];
+}
+
+/// Given a couple (reg, MI) get the corresponding set of instructions from the
+/// the given "sets".
+/// This is used to get the uses record in sets of a definition identified by
+/// MI and reg, i.e., MI defines reg.
+/// If the couple does not reference anything, an empty set is added to
+/// "sets[reg]".
+/// \pre set[reg] is valid.
+static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
+ const MachineInstr &MI) {
+ return sets[reg][&MI];
+}
+
+/// Same as getUses but does not modify the input map: sets.
+/// \return NULL if the couple (reg, MI) is not in sets.
+static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
+ const MachineInstr &MI) {
+ InstrToInstrs::const_iterator Res = sets[reg].find(&MI);
+ if (Res != sets[reg].end())
+ return &(Res->second);
+ return nullptr;
+}
+
+/// Initialize the reaching definition algorithm:
+/// For each basic block BB in MF, record:
+/// - its kill set.
+/// - its reachable uses (uses that are exposed to BB's predecessors).
+/// - its the generated definitions.
+/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to
+/// the list of uses of exposed defintions.
+/// \param ADRPMode specifies to only consider ADRP instructions for generated
+/// definition. It also consider definitions of ADRP instructions as uses and
+/// ignore other uses. The ADRPMode is used to collect the information for LHO
+/// that involve ADRP operation only.
+static void initReachingDef(MachineFunction &MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+ BlockToSetOfInstrsPerColor &ReachableUses,
+ const MapRegToId &RegToId,
+ const MachineInstr *DummyOp, bool ADRPMode) {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+ unsigned NbReg = RegToId.size();
+
+ for (MachineBasicBlock &MBB : MF) {
+ const MachineInstr **&BBGen = Gen[&MBB];
+ BBGen = new const MachineInstr *[NbReg];
+ memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+
+ BitVector &BBKillSet = Kill[&MBB];
+ BBKillSet.resize(NbReg);
+ for (const MachineInstr &MI : MBB) {
+ bool IsADRP = MI.getOpcode() == AArch64::ADRP;
+
+ // Process uses first.
+ if (IsADRP || !ADRPMode)
+ for (const MachineOperand &MO : MI.operands()) {
+ // Treat ADRP def as use, as the goal of the analysis is to find
+ // ADRP defs reached by other ADRP defs.
+ if (!MO.isReg() || (!ADRPMode && !MO.isUse()) ||
+ (ADRPMode && (!IsADRP || !MO.isDef())))
+ continue;
+ unsigned CurReg = MO.getReg();
+ MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+ if (ItCurRegId == RegToId.end())
+ continue;
+ CurReg = ItCurRegId->second;
+
+ // if CurReg has not been defined, this use is reachable.
+ if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
+ getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI);
+ // current basic block definition for this color, if any, is in Gen.
+ if (BBGen[CurReg])
+ getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI);
+ }
+
+ // Process clobbers.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ // Clobbers kill the related colors.
+ const uint32_t *PreservedRegs = MO.getRegMask();
+
+ // Set generated regs.
+ for (const auto Entry : RegToId) {
+ unsigned Reg = Entry.second;
+ // Use the global register ID when querying APIs external to this
+ // pass.
+ if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) {
+ // Do not register clobbered definition for no ADRP.
+ // This definition is not used anyway (otherwise register
+ // allocation is wrong).
+ BBGen[Reg] = ADRPMode ? &MI : nullptr;
+ BBKillSet.set(Reg);
+ }
+ }
+ }
+
+ // Process register defs.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned CurReg = MO.getReg();
+ MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+ if (ItCurRegId == RegToId.end())
+ continue;
+
+ for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
+ MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
+ assert(ItRegId != RegToId.end() &&
+ "Sub-register of an "
+ "involved register, not recorded as involved!");
+ BBKillSet.set(ItRegId->second);
+ BBGen[ItRegId->second] = &MI;
+ }
+ BBGen[ItCurRegId->second] = &MI;
+ }
+ }
+
+ // If we restrict our analysis to basic block scope, conservatively add a
+ // dummy
+ // use for each generated value.
+ if (!ADRPMode && DummyOp && !MBB.succ_empty())
+ for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
+ if (BBGen[CurReg])
+ getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp);
+ }
+}
+
+/// Reaching def core algorithm:
+/// while an Out has changed
+/// for each bb
+/// for each color
+/// In[bb][color] = U Out[bb.predecessors][color]
+/// insert reachableUses[bb][color] in each in[bb][color]
+/// op.reachedUses
+///
+/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+static void reachingDefAlgorithm(MachineFunction &MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ BlockToSetOfInstrsPerColor &In,
+ BlockToSetOfInstrsPerColor &Out,
+ BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+ BlockToSetOfInstrsPerColor &ReachableUses,
+ unsigned NbReg) {
+ bool HasChanged;
+ do {
+ HasChanged = false;
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned CurReg;
+ for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+ SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg);
+ SetOfMachineInstr &BBReachableUses =
+ getSet(ReachableUses, MBB, CurReg, NbReg);
+ SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg);
+ unsigned Size = BBOutSet.size();
+ // In[bb][color] = U Out[bb.predecessors][color]
+ for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
+ SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg);
+ BBInSet.insert(PredOutSet.begin(), PredOutSet.end());
+ }
+ // insert reachableUses[bb][color] in each in[bb][color] op.reachedses
+ for (const MachineInstr *MI : BBInSet) {
+ SetOfMachineInstr &OpReachedUses =
+ getUses(ColorOpToReachedUses, CurReg, *MI);
+ OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end());
+ }
+ // Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+ if (!Kill[&MBB].test(CurReg))
+ BBOutSet.insert(BBInSet.begin(), BBInSet.end());
+ if (Gen[&MBB][CurReg])
+ BBOutSet.insert(Gen[&MBB][CurReg]);
+ HasChanged |= BBOutSet.size() != Size;
+ }
+ }
+ } while (HasChanged);
+}
+
+/// Release all memory dynamically allocated during the reaching
+/// definition algorithm.
+static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
+ BlockToSetOfInstrsPerColor &Out,
+ BlockToInstrPerColor &Gen,
+ BlockToSetOfInstrsPerColor &ReachableUses) {
+ for (auto &IT : Out)
+ delete[] IT.second;
+ for (auto &IT : In)
+ delete[] IT.second;
+ for (auto &IT : ReachableUses)
+ delete[] IT.second;
+ for (auto &IT : Gen)
+ delete[] IT.second;
+}
+
+/// Reaching definition algorithm.
+/// \param MF function on which the algorithm will operate.
+/// \param[out] ColorOpToReachedUses will contain the result of the reaching
+/// def algorithm.
+/// \param ADRPMode specify whether the reaching def algorithm should be tuned
+/// for ADRP optimization. \see initReachingDef for more details.
+/// \param DummyOp if not NULL, the algorithm will work at
+/// basic block scope and will set for every exposed definition a use to
+/// @p DummyOp.
+/// \pre ColorOpToReachedUses is an array of at least number of registers of
+/// InstrToInstrs.
+static void reachingDef(MachineFunction &MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ const MapRegToId &RegToId, bool ADRPMode = false,
+ const MachineInstr *DummyOp = nullptr) {
+ // structures:
+ // For each basic block.
+ // Out: a set per color of definitions that reach the
+ // out boundary of this block.
+ // In: Same as Out but for in boundary.
+ // Gen: generated color in this block (one operation per color).
+ // Kill: register set of killed color in this block.
+ // ReachableUses: a set per color of uses (operation) reachable
+ // for "In" definitions.
+ BlockToSetOfInstrsPerColor Out, In, ReachableUses;
+ BlockToInstrPerColor Gen;
+ BlockToRegSet Kill;
+
+ // Initialize Gen, kill and reachableUses.
+ initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId,
+ DummyOp, ADRPMode);
+
+ // Algo.
+ if (!DummyOp)
+ reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
+ ReachableUses, RegToId.size());
+
+ // finit.
+ finitReachingDef(In, Out, Gen, ReachableUses);
+}
+
+#ifndef NDEBUG
+/// print the result of the reaching definition algorithm.
+static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
+ unsigned NbReg, const TargetRegisterInfo *TRI,
+ const MapIdToReg &IdToReg) {
+ unsigned CurReg;
+ for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+ if (ColorOpToReachedUses[CurReg].empty())
+ continue;
+ DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n");
+
+ for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+ DEBUG(dbgs() << "Def:\n");
+ DEBUG(DefsIt.first->print(dbgs()));
+ DEBUG(dbgs() << "Reachable uses:\n");
+ for (const MachineInstr *MI : DefsIt.second) {
+ DEBUG(MI->print(dbgs()));
+ }
+ }
+ }
+}
+#endif // NDEBUG
+
+/// Answer the following question: Can Def be one of the definition
+/// involved in a part of a LOH?
+static bool canDefBePartOfLOH(const MachineInstr *Def) {
+ unsigned Opc = Def->getOpcode();
+ // Accept ADRP, ADDLow and LOADGot.
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::ADRP:
+ return true;
+ case AArch64::ADDXri:
+ // Check immediate to see if the immediate is an address.
+ switch (Def->getOperand(2).getType()) {
+ default:
+ return false;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_BlockAddress:
+ return true;
+ }
+ case AArch64::LDRXui:
+ // Check immediate to see if the immediate is an address.
+ switch (Def->getOperand(2).getType()) {
+ default:
+ return false;
+ case MachineOperand::MO_GlobalAddress:
+ return true;
+ }
+ }
+ // Unreachable.
+ return false;
+}
+
+/// Check whether the given instruction can the end of a LOH chain involving a
+/// store.
+static bool isCandidateStore(const MachineInstr *Instr) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case AArch64::STRBui:
+ case AArch64::STRHui:
+ case AArch64::STRWui:
+ case AArch64::STRXui:
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ // In case we have str xA, [xA, #imm], this is two different uses
+ // of xA and we cannot fold, otherwise the xA stored may be wrong,
+ // even if #imm == 0.
+ if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg())
+ return true;
+ }
+ return false;
+}
+
+/// Given the result of a reaching definition algorithm in ColorOpToReachedUses,
+/// Build the Use to Defs information and filter out obvious non-LOH candidates.
+/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
+/// In non-ADRPMode, non-LOH candidates are "uses" with several definition,
+/// i.e., no simple chain.
+/// \param ADRPMode -- \see initReachingDef.
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
+ const InstrToInstrs *ColorOpToReachedUses,
+ const MapRegToId &RegToId,
+ bool ADRPMode = false) {
+
+ SetOfMachineInstr NotCandidate;
+ unsigned NbReg = RegToId.size();
+ MapRegToId::const_iterator EndIt = RegToId.end();
+ for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) {
+ // If this color is never defined, continue.
+ if (ColorOpToReachedUses[CurReg].empty())
+ continue;
+
+ for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+ for (const MachineInstr *MI : DefsIt.second) {
+ const MachineInstr *Def = DefsIt.first;
+ MapRegToId::const_iterator It;
+ // if all the reaching defs are not adrp, this use will not be
+ // simplifiable.
+ if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) ||
+ (!ADRPMode && !canDefBePartOfLOH(Def)) ||
+ (!ADRPMode && isCandidateStore(MI) &&
+ // store are LOH candidate iff the end of the chain is used as
+ // base.
+ ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt ||
+ It->second != CurReg))) {
+ NotCandidate.insert(MI);
+ continue;
+ }
+ // Do not consider self reaching as a simplifiable case for ADRP.
+ if (!ADRPMode || MI != DefsIt.first) {
+ UseToReachingDefs[MI].insert(DefsIt.first);
+ // If UsesIt has several reaching definitions, it is not
+ // candidate for simplificaton in non-ADRPMode.
+ if (!ADRPMode && UseToReachingDefs[MI].size() > 1)
+ NotCandidate.insert(MI);
+ }
+ }
+ }
+ }
+ for (const MachineInstr *Elem : NotCandidate) {
+ DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
+ // It would have been better if we could just remove the entry
+ // from the map. Because of that, we have to filter the garbage
+ // (second.empty) in the subsequence analysis.
+ UseToReachingDefs[Elem].clear();
+ }
+}
+
+/// Based on the use to defs information (in ADRPMode), compute the
+/// opportunities of LOH ADRP-related.
+static void computeADRP(const InstrToInstrs &UseToDefs,
+ AArch64FunctionInfo &AArch64FI,
+ const MachineDominatorTree *MDT) {
+ DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
+ for (const auto &Entry : UseToDefs) {
+ unsigned Size = Entry.second.size();
+ if (Size == 0)
+ continue;
+ if (Size == 1) {
+ const MachineInstr *L2 = *Entry.second.begin();
+ const MachineInstr *L1 = Entry.first;
+ if (!MDT->dominates(L2, L1)) {
+ DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
+ << '\n');
+ continue;
+ }
+ DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
+ SmallVector<const MachineInstr *, 2> Args;
+ Args.push_back(L2);
+ Args.push_back(L1);
+ AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+ ++NumADRPSimpleCandidate;
+ }
+#ifdef DEBUG
+ else if (Size == 2)
+ ++NumADRPComplexCandidate2;
+ else if (Size == 3)
+ ++NumADRPComplexCandidate3;
+ else
+ ++NumADRPComplexCandidateOther;
+#endif
+ // if Size < 1, the use should have been removed from the candidates
+ assert(Size >= 1 && "No reaching defs for that use!");
+ }
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a load.
+static bool isCandidateLoad(const MachineInstr *Instr) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ case AArch64::LDRBui:
+ case AArch64::LDRHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)
+ return false;
+ return true;
+ }
+ // Unreachable.
+ return false;
+}
+
+/// Check whether the given instruction can load a litteral.
+static bool supportLoadFromLiteral(const MachineInstr *Instr) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDRSWui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ return true;
+ }
+ // Unreachable.
+ return false;
+}
+
+/// Check whether the given instruction is a LOH candidate.
+/// \param UseToDefs is used to check that Instr is at the end of LOH supported
+/// chain.
+/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are
+/// already been filtered out.
+static bool isCandidate(const MachineInstr *Instr,
+ const InstrToInstrs &UseToDefs,
+ const MachineDominatorTree *MDT) {
+ if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
+ return false;
+
+ const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
+ if (Def->getOpcode() != AArch64::ADRP) {
+ // At this point, Def is ADDXri or LDRXui of the right type of
+ // symbol, because we filtered out the uses that were not defined
+ // by these kind of instructions (+ ADRP).
+
+ // Check if this forms a simple chain: each intermediate node must
+ // dominates the next one.
+ if (!MDT->dominates(Def, Instr))
+ return false;
+ // Move one node up in the simple chain.
+ if (UseToDefs.find(Def) ==
+ UseToDefs.end()
+ // The map may contain garbage we have to ignore.
+ ||
+ UseToDefs.find(Def)->second.empty())
+ return false;
+ Instr = Def;
+ Def = *UseToDefs.find(Def)->second.begin();
+ }
+ // Check if we reached the top of the simple chain:
+ // - top is ADRP.
+ // - check the simple chain property: each intermediate node must
+ // dominates the next one.
+ if (Def->getOpcode() == AArch64::ADRP)
+ return MDT->dominates(Def, Instr);
+ return false;
+}
+
+static bool registerADRCandidate(const MachineInstr &Use,
+ const InstrToInstrs &UseToDefs,
+ const InstrToInstrs *DefsPerColorToUses,
+ AArch64FunctionInfo &AArch64FI,
+ SetOfMachineInstr *InvolvedInLOHs,
+ const MapRegToId &RegToId) {
+ // Look for opportunities to turn ADRP -> ADD or
+ // ADRP -> LDR GOTPAGEOFF into ADR.
+ // If ADRP has more than one use. Give up.
+ if (Use.getOpcode() != AArch64::ADDXri &&
+ (Use.getOpcode() != AArch64::LDRXui ||
+ !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT)))
+ return false;
+ InstrToInstrs::const_iterator It = UseToDefs.find(&Use);
+ // The map may contain garbage that we need to ignore.
+ if (It == UseToDefs.end() || It->second.empty())
+ return false;
+ const MachineInstr &Def = **It->second.begin();
+ if (Def.getOpcode() != AArch64::ADRP)
+ return false;
+ // Check the number of users of ADRP.
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def.getOperand(0).getReg())->second, Def);
+ if (Users->size() > 1) {
+ ++NumADRComplexCandidate;
+ return false;
+ }
+ ++NumADRSimpleCandidate;
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) &&
+ "ADRP already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) &&
+ "ADD already involved in LOH.");
+ DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
+
+ SmallVector<const MachineInstr *, 2> Args;
+ Args.push_back(&Def);
+ Args.push_back(&Use);
+
+ AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd
+ : MCLOH_AdrpLdrGot,
+ Args);
+ return true;
+}
+
+/// Based on the use to defs information (in non-ADRPMode), compute the
+/// opportunities of LOH non-ADRP-related
+static void computeOthers(const InstrToInstrs &UseToDefs,
+ const InstrToInstrs *DefsPerColorToUses,
+ AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId,
+ const MachineDominatorTree *MDT) {
+ SetOfMachineInstr *InvolvedInLOHs = nullptr;
+#ifdef DEBUG
+ SetOfMachineInstr InvolvedInLOHsStorage;
+ InvolvedInLOHs = &InvolvedInLOHsStorage;
+#endif // DEBUG
+ DEBUG(dbgs() << "*** Compute LOH for Others\n");
+ // ADRP -> ADD/LDR -> LDR/STR pattern.
+ // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern.
+
+ // FIXME: When the statistics are not important,
+ // This initial filtering loop can be merged into the next loop.
+ // Currently, we didn't do it to have the same code for both DEBUG and
+ // NDEBUG builds. Indeed, the iterator of the second loop would need
+ // to be changed.
+ SetOfMachineInstr PotentialCandidates;
+ SetOfMachineInstr PotentialADROpportunities;
+ for (auto &Use : UseToDefs) {
+ // If no definition is available, this is a non candidate.
+ if (Use.second.empty())
+ continue;
+ // Keep only instructions that are load or store and at the end of
+ // a ADRP -> ADD/LDR/Nothing chain.
+ // We already filtered out the no-chain cases.
+ if (!isCandidate(Use.first, UseToDefs, MDT)) {
+ PotentialADROpportunities.insert(Use.first);
+ continue;
+ }
+ PotentialCandidates.insert(Use.first);
+ }
+
+ // Make the following distinctions for statistics as the linker does
+ // know how to decode instructions:
+ // - ADD/LDR/Nothing make there different patterns.
+ // - LDR/STR make two different patterns.
+ // Hence, 6 - 1 base patterns.
+ // (because ADRP-> Nothing -> STR is not simplifiable)
+
+ // The linker is only able to have a simple semantic, i.e., if pattern A
+ // do B.
+ // However, we want to see the opportunity we may miss if we were able to
+ // catch more complex cases.
+
+ // PotentialCandidates are result of a chain ADRP -> ADD/LDR ->
+ // A potential candidate becomes a candidate, if its current immediate
+ // operand is zero and all nodes of the chain have respectively only one user
+#ifdef DEBUG
+ SetOfMachineInstr DefsOfPotentialCandidates;
+#endif
+ for (const MachineInstr *Candidate : PotentialCandidates) {
+ // Get the definition of the candidate i.e., ADD or LDR.
+ const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
+ // Record the elements of the chain.
+ const MachineInstr *L1 = Def;
+ const MachineInstr *L2 = nullptr;
+ unsigned ImmediateDefOpc = Def->getOpcode();
+ if (Def->getOpcode() != AArch64::ADRP) {
+ // Check the number of users of this node.
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+ if (Users->size() > 1) {
+#ifdef DEBUG
+ // if all the uses of this def are in potential candidate, this is
+ // a complex candidate of level 2.
+ bool IsLevel2 = true;
+ for (const MachineInstr *MI : *Users) {
+ if (!PotentialCandidates.count(MI)) {
+ ++NumTooCplxLvl2;
+ IsLevel2 = false;
+ break;
+ }
+ }
+ if (IsLevel2)
+ ++NumCplxLvl2;
+#endif // DEBUG
+ PotentialADROpportunities.insert(Def);
+ continue;
+ }
+ L2 = Def;
+ Def = *UseToDefs.find(Def)->second.begin();
+ L1 = Def;
+ } // else the element in the middle of the chain is nothing, thus
+ // Def already contains the first element of the chain.
+
+ // Check the number of users of the first node in the chain, i.e., ADRP
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+ if (Users->size() > 1) {
+#ifdef DEBUG
+ // if all the uses of this def are in the defs of the potential candidate,
+ // this is a complex candidate of level 1
+ if (DefsOfPotentialCandidates.empty()) {
+ // lazy init
+ DefsOfPotentialCandidates = PotentialCandidates;
+ for (const MachineInstr *Candidate : PotentialCandidates) {
+ if (!UseToDefs.find(Candidate)->second.empty())
+ DefsOfPotentialCandidates.insert(
+ *UseToDefs.find(Candidate)->second.begin());
+ }
+ }
+ bool Found = false;
+ for (auto &Use : *Users) {
+ if (!DefsOfPotentialCandidates.count(Use)) {
+ ++NumTooCplxLvl1;
+ Found = true;
+ break;
+ }
+ }
+ if (!Found)
+ ++NumCplxLvl1;
+#endif // DEBUG
+ continue;
+ }
+
+ bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
+ // If the chain is three instructions long and ldr is the second element,
+ // then this ldr must load form GOT, otherwise this is not a correct chain.
+ if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT)
+ continue;
+ SmallVector<const MachineInstr *, 3> Args;
+ MCLOHType Kind;
+ if (isCandidateLoad(Candidate)) {
+ if (!L2) {
+ // At this point, the candidate LOH indicates that the ldr instruction
+ // may use a direct access to the symbol. There is not such encoding
+ // for loads of byte and half.
+ if (!supportLoadFromLiteral(Candidate))
+ continue;
+
+ DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate
+ << '\n');
+ Kind = MCLOH_AdrpLdr;
+ Args.push_back(L1);
+ Args.push_back(Candidate);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+ ++NumADRPToLDR;
+ } else {
+ DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+ << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+ << '\n');
+
+ Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr;
+ Args.push_back(L1);
+ Args.push_back(L2);
+ Args.push_back(Candidate);
+
+ PotentialADROpportunities.remove(L2);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+ "L2 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+#ifdef DEBUG
+ // get the immediate of the load
+ if (Candidate->getOperand(2).getImm() == 0)
+ if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToLDR;
+ else
+ ++NumLDRToLDR;
+ else if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToLDRWithImm;
+ else
+ ++NumLDRToLDRWithImm;
+#endif // DEBUG
+ }
+ } else {
+ if (ImmediateDefOpc == AArch64::ADRP)
+ continue;
+ else {
+
+ DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+ << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+ << '\n');
+
+ Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr;
+ Args.push_back(L1);
+ Args.push_back(L2);
+ Args.push_back(Candidate);
+
+ PotentialADROpportunities.remove(L2);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+ "L2 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+#ifdef DEBUG
+ // get the immediate of the store
+ if (Candidate->getOperand(2).getImm() == 0)
+ if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToSTR;
+ else
+ ++NumLDRToSTR;
+ else if (ImmediateDefOpc == AArch64::ADDXri)
+ ++NumADDToSTRWithImm;
+ else
+ ++NumLDRToSTRWithImm;
+#endif // DEBUG
+ }
+ }
+ AArch64FI.addLOHDirective(Kind, Args);
+ }
+
+ // Now, we grabbed all the big patterns, check ADR opportunities.
+ for (const MachineInstr *Candidate : PotentialADROpportunities)
+ registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI,
+ InvolvedInLOHs, RegToId);
+}
+
+/// Look for every register defined by potential LOHs candidates.
+/// Map these registers with dense id in @p RegToId and vice-versa in
+/// @p IdToReg. @p IdToReg is populated only in DEBUG mode.
+static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
+ MapIdToReg &IdToReg,
+ const TargetRegisterInfo *TRI) {
+ unsigned CurRegId = 0;
+ if (!PreCollectRegister) {
+ unsigned NbReg = TRI->getNumRegs();
+ for (; CurRegId < NbReg; ++CurRegId) {
+ RegToId[CurRegId] = CurRegId;
+ DEBUG(IdToReg.push_back(CurRegId));
+ DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches"));
+ }
+ return;
+ }
+
+ DEBUG(dbgs() << "** Collect Involved Register\n");
+ for (const auto &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (!canDefBePartOfLOH(&MI))
+ continue;
+
+ // Process defs
+ for (MachineInstr::const_mop_iterator IO = MI.operands_begin(),
+ IOEnd = MI.operands_end();
+ IO != IOEnd; ++IO) {
+ if (!IO->isReg() || !IO->isDef())
+ continue;
+ unsigned CurReg = IO->getReg();
+ for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI)
+ if (RegToId.find(*AI) == RegToId.end()) {
+ DEBUG(IdToReg.push_back(*AI);
+ assert(IdToReg[CurRegId] == *AI &&
+ "Reg index mismatches insertion index."));
+ RegToId[*AI] = CurRegId++;
+ DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n');
+ }
+ }
+ }
+ }
+}
+
+bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+
+ MapRegToId RegToId;
+ MapIdToReg IdToReg;
+ AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>();
+ assert(AArch64FI && "No MachineFunctionInfo for this function!");
+
+ DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n');
+
+ collectInvolvedReg(MF, RegToId, IdToReg, TRI);
+ if (RegToId.empty())
+ return false;
+
+ MachineInstr *DummyOp = nullptr;
+ if (BasicBlockScopeOnly) {
+ const AArch64InstrInfo *TII =
+ static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+ // For local analysis, create a dummy operation to record uses that are not
+ // local.
+ DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
+ }
+
+ unsigned NbReg = RegToId.size();
+ bool Modified = false;
+
+ // Start with ADRP.
+ InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+ // Compute the reaching def in ADRP mode, meaning ADRP definitions
+ // are first considered as uses.
+ reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp);
+ DEBUG(dbgs() << "ADRP reaching defs\n");
+ DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+ // Translate the definition to uses map into a use to definitions map to ease
+ // statistic computation.
+ InstrToInstrs ADRPToReachingDefs;
+ reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true);
+
+ // Compute LOH for ADRP.
+ computeADRP(ADRPToReachingDefs, *AArch64FI, MDT);
+ delete[] ColorOpToReachedUses;
+
+ // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern.
+ ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+ // first perform a regular reaching def analysis.
+ reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp);
+ DEBUG(dbgs() << "All reaching defs\n");
+ DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+ // Turn that into a use to defs to ease statistic computation.
+ InstrToInstrs UsesToReachingDefs;
+ reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false);
+
+ // Compute other than AdrpAdrp LOH.
+ computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId,
+ MDT);
+ delete[] ColorOpToReachedUses;
+
+ if (BasicBlockScopeOnly)
+ MF.DeleteMachineInstr(DummyOp);
+
+ return Modified;
+}
+
+/// createAArch64CollectLOHPass - returns an instance of the Statistic for
+/// linker optimization pass.
+FunctionPass *llvm::createAArch64CollectLOHPass() {
+ return new AArch64CollectLOH();
+}
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
new file mode 100644
index 0000000..452cdec
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -0,0 +1,919 @@
+//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64ConditionalCompares pass which reduces
+// branching and code size by using the conditional compare instructions CCMP,
+// CCMN, and FCMP.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ccmp"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+ "aarch64-ccmp-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
+STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
+STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)");
+STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)");
+
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
+
+//===----------------------------------------------------------------------===//
+// SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+// From: Head To: Head
+// / | CmpBB
+// / | / |
+// | CmpBB / |
+// | / | Tail |
+// | / | | |
+// Tail | | |
+// | | | |
+// ... ... ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The AArch64 conditional compare instructions have an immediate
+// operand that specifies the NZCV flag values when the condition is false and
+// the compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+// if (a == 5 || b == 17)
+// foo();
+//
+// Head:
+// cmp w0, #5
+// b.eq Tail
+// CmpBB:
+// cmp w1, #17
+// b.eq Tail
+// ...
+// Tail:
+// bl _foo
+//
+// Becomes:
+//
+// Head:
+// cmp w0, #5
+// ccmp w1, #17, 4, ne ; 4 = nZcv
+// b.eq Tail
+// ...
+// Tail:
+// bl _foo
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+//
+// FIXME: It should also be possible to speculate a block on the critical edge
+// between Head and Tail, just like if-converting a diamond.
+//
+// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
+
+namespace {
+class SSACCmpConv {
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The first block containing a conditional branch, dominating everything
+ /// else.
+ MachineBasicBlock *Head;
+
+ /// The block containing cmp+br.cond with a successor shared with Head.
+ MachineBasicBlock *CmpBB;
+
+ /// The common successor for Head and CmpBB.
+ MachineBasicBlock *Tail;
+
+ /// The compare instruction in CmpBB that can be converted to a ccmp.
+ MachineInstr *CmpMI;
+
+private:
+ /// The branch condition in Head as determined by AnalyzeBranch.
+ SmallVector<MachineOperand, 4> HeadCond;
+
+ /// The condition code that makes Head branch to CmpBB.
+ AArch64CC::CondCode HeadCmpBBCC;
+
+ /// The branch condition in CmpBB.
+ SmallVector<MachineOperand, 4> CmpBBCond;
+
+ /// The condition code that makes CmpBB branch to Tail.
+ AArch64CC::CondCode CmpBBTailCC;
+
+ /// Check if the Tail PHIs are trivially convertible.
+ bool trivialTailPHIs();
+
+ /// Remove CmpBB from the Tail PHIs.
+ void updateTailPHIs();
+
+ /// Check if an operand defining DstReg is dead.
+ bool isDeadDef(unsigned DstReg);
+
+ /// Find the compare instruction in MBB that controls the conditional branch.
+ /// Return NULL if a convertible instruction can't be found.
+ MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ }
+
+ /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+ /// internal state, and return true.
+ bool canConvert(MachineBasicBlock *MBB);
+
+ /// Cmo-convert the last block passed to canConvertCmp(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+
+ /// Return the expected code size delta if the conversion into a
+ /// conditional compare is performed.
+ int expectedCodeSizeDelta() const;
+};
+} // end anonymous namespace
+
+// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
+// This means that no if-conversion is required when merging CmpBB into Head.
+bool SSACCmpConv::trivialTailPHIs() {
+ for (auto &I : *Tail) {
+ if (!I.isPHI())
+ break;
+ unsigned HeadReg = 0, CmpBBReg = 0;
+ // PHI operands come in (VReg, MBB) pairs.
+ for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+ MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+ unsigned Reg = I.getOperand(oi).getReg();
+ if (MBB == Head) {
+ assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+ HeadReg = Reg;
+ }
+ if (MBB == CmpBB) {
+ assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+ CmpBBReg = Reg;
+ }
+ }
+ if (HeadReg != CmpBBReg)
+ return false;
+ }
+ return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+ for (auto &I : *Tail) {
+ if (!I.isPHI())
+ break;
+ // I is a PHI. It can have multiple entries for CmpBB.
+ for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
+ // PHI operands are (Reg, MBB) at (oi-2, oi-1).
+ if (I.getOperand(oi - 1).getMBB() == CmpBB) {
+ I.RemoveOperand(oi - 1);
+ I.RemoveOperand(oi - 2);
+ }
+ }
+ }
+}
+
+// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares
+// are still writing virtual registers without any uses.
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+ // Writes to the zero register are dead.
+ if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
+ return true;
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+ // A virtual register def without any uses will be marked dead later, and
+ // eventually replaced by the zero register.
+ return MRI->use_nodbg_empty(DstReg);
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Return
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+ // A normal br.cond simply has the condition code.
+ if (Cond[0].getImm() != -1) {
+ assert(Cond.size() == 1 && "Unknown Cond array format");
+ CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+ return true;
+ }
+ // For tbz and cbz instruction, the opcode is next.
+ switch (Cond[1].getImm()) {
+ default:
+ // This includes tbz / tbnz branches which can't be converted to
+ // ccmp + br.cond.
+ return false;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ assert(Cond.size() == 3 && "Unknown Cond array format");
+ CC = AArch64CC::EQ;
+ return true;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ assert(Cond.size() == 3 && "Unknown Cond array format");
+ CC = AArch64CC::NE;
+ return true;
+ }
+}
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+ if (I == MBB->end())
+ return nullptr;
+ // The terminator must be controlled by the flags.
+ if (!I->readsRegister(AArch64::NZCV)) {
+ switch (I->getOpcode()) {
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ // These can be converted into a ccmp against #0.
+ return I;
+ }
+ ++NumCmpTermRejs;
+ DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+ return nullptr;
+ }
+
+ // Now find the instruction controlling the terminator.
+ for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+ --I;
+ assert(!I->isTerminator() && "Spurious terminator");
+ switch (I->getOpcode()) {
+ // cmp is an alias for subs with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ // cmn is an alias for adds with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ // Check that the immediate operand is within range, ccmp wants a uimm5.
+ // Rd = SUBSri Rn, imm, shift
+ if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
+ DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+ ++NumImmRangeRejs;
+ return nullptr;
+ }
+ // Fall through.
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXrr:
+ if (isDeadDef(I->getOperand(0).getReg()))
+ return I;
+ DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+ ++NumLiveDstRejs;
+ return nullptr;
+ case AArch64::FCMPSrr:
+ case AArch64::FCMPDrr:
+ case AArch64::FCMPESrr:
+ case AArch64::FCMPEDrr:
+ return I;
+ }
+
+ // Check for flag reads and clobbers.
+ MIOperands::PhysRegInfo PRI =
+ MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
+
+ if (PRI.Reads) {
+ // The ccmp doesn't produce exactly the same flags as the original
+ // compare, so reject the transform if there are uses of the flags
+ // besides the terminators.
+ DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+ ++NumMultNZCVUses;
+ return nullptr;
+ }
+
+ if (PRI.Clobbers) {
+ DEBUG(dbgs() << "Not convertible compare: " << *I);
+ ++NumUnknNZCVDefs;
+ return nullptr;
+ }
+ }
+ DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+ return nullptr;
+}
+
+/// Determine if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// Only CmpMI is allowed to clobber the flags.
+///
+bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
+ const MachineInstr *CmpMI) {
+ // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
+ if (I.isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I.isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I.mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << I);
+ return false;
+ }
+
+ // Only CmpMI is allowed to clobber the flags.
+ if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
+ DEBUG(dbgs() << "Clobbers flags: " << I);
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
+/// candidate for cmp-conversion. Fill out the internal state.
+///
+bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
+ Head = MBB;
+ Tail = CmpBB = nullptr;
+
+ if (Head->succ_size() != 2)
+ return false;
+ MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+ MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+ // CmpBB can only have a single predecessor. Tail is allowed many.
+ if (Succ0->pred_size() != 1)
+ std::swap(Succ0, Succ1);
+
+ // Succ0 is our candidate for CmpBB.
+ if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
+ return false;
+
+ CmpBB = Succ0;
+ Tail = Succ1;
+
+ if (!CmpBB->isSuccessor(Tail))
+ return false;
+
+ // The CFG topology checks out.
+ DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
+ << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
+ ++NumConsidered;
+
+ // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
+ //
+ // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
+ // defined before The CmpBB cmp clobbers the flags. Alternatively, it should
+ // always be safe to sink the ccmp down to immediately before the CmpBB
+ // terminators.
+ if (!trivialTailPHIs()) {
+ DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+ ++NumPhiRejs;
+ return false;
+ }
+
+ if (!Tail->livein_empty()) {
+ DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+ ++NumPhysRejs;
+ return false;
+ }
+
+ // CmpBB should never have PHIs since Head is its only predecessor.
+ // FIXME: Clean them up if it happens.
+ if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+ DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+ ++NumPhi2Rejs;
+ return false;
+ }
+
+ if (!CmpBB->livein_empty()) {
+ DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+ ++NumPhysRejs;
+ return false;
+ }
+
+ // The branch we're looking to eliminate must be analyzable.
+ HeadCond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
+ DEBUG(dbgs() << "Head branch not analyzable.\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ // This is weird, probably some sort of degenerate CFG, or an edge to a
+ // landing pad.
+ if (!TBB || HeadCond.empty()) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ if (!parseCond(HeadCond, HeadCmpBBCC)) {
+ DEBUG(dbgs() << "Unsupported branch type on Head\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ // Make sure the branch direction is right.
+ if (TBB != CmpBB) {
+ assert(TBB == Tail && "Unexpected TBB");
+ HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC);
+ }
+
+ CmpBBCond.clear();
+ TBB = FBB = nullptr;
+ if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+ DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (!TBB || CmpBBCond.empty()) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+ DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (TBB != Tail)
+ CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
+
+ DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
+ << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
+ << '\n');
+
+ CmpMI = findConvertibleCompare(CmpBB);
+ if (!CmpMI)
+ return false;
+
+ if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+ ++NumSpeculateRejs;
+ return false;
+ }
+ return true;
+}
+
+void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
+ DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
+ << Head->getNumber() << ":\n" << *CmpBB);
+
+ // All CmpBB instructions are moved into Head, and CmpBB is deleted.
+ // Update the CFG first.
+ updateTailPHIs();
+ Head->removeSuccessor(CmpBB);
+ CmpBB->removeSuccessor(Tail);
+ Head->transferSuccessorsAndUpdatePHIs(CmpBB);
+ DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
+ TII->RemoveBranch(*Head);
+
+ // If the Head terminator was one of the cbz / tbz branches with built-in
+ // compare, we need to insert an explicit compare instruction in its place.
+ if (HeadCond[0].getImm() == -1) {
+ ++NumCompBranches;
+ unsigned Opc = 0;
+ switch (HeadCond[1].getImm()) {
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ Opc = AArch64::SUBSWri;
+ break;
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ Opc = AArch64::SUBSXri;
+ break;
+ default:
+ llvm_unreachable("Cannot convert Head branch");
+ }
+ const MCInstrDesc &MCID = TII->get(Opc);
+ // Create a dummy virtual register for the SUBS def.
+ unsigned DestReg =
+ MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
+ // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
+ BuildMI(*Head, Head->end(), TermDL, MCID)
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addOperand(HeadCond[2])
+ .addImm(0)
+ .addImm(0);
+ // SUBS uses the GPR*sp register classes.
+ MRI->constrainRegClass(HeadCond[2].getReg(),
+ TII->getRegClass(MCID, 1, TRI, *MF));
+ }
+
+ Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
+
+ // Now replace CmpMI with a ccmp instruction that also considers the incoming
+ // flags.
+ unsigned Opc = 0;
+ unsigned FirstOp = 1; // First CmpMI operand to copy.
+ bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
+ switch (CmpMI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown compare opcode");
+ case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break;
+ case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break;
+ case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break;
+ case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break;
+ case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break;
+ case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break;
+ case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break;
+ case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break;
+ case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break;
+ case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break;
+ case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break;
+ case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break;
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ Opc = AArch64::CCMPWi;
+ FirstOp = 0;
+ isZBranch = true;
+ break;
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ Opc = AArch64::CCMPXi;
+ FirstOp = 0;
+ isZBranch = true;
+ break;
+ }
+
+ // The ccmp instruction should set the flags according to the comparison when
+ // Head would have branched to CmpBB.
+ // The NZCV immediate operand should provide flags for the case where Head
+ // would have branched to Tail. These flags should cause the new Head
+ // terminator to branch to tail.
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+ const MCInstrDesc &MCID = TII->get(Opc);
+ MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
+ TII->getRegClass(MCID, 0, TRI, *MF));
+ if (CmpMI->getOperand(FirstOp + 1).isReg())
+ MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
+ TII->getRegClass(MCID, 1, TRI, *MF));
+ MachineInstrBuilder MIB =
+ BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+ .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+ if (isZBranch)
+ MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
+ else
+ MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+ MIB.addImm(NZCV).addImm(HeadCmpBBCC);
+
+ // If CmpMI was a terminator, we need a new conditional branch to replace it.
+ // This now becomes a Head terminator.
+ if (isZBranch) {
+ bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW ||
+ CmpMI->getOpcode() == AArch64::CBNZX;
+ BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
+ .addOperand(CmpMI->getOperand(1)); // Branch target.
+ }
+ CmpMI->eraseFromParent();
+ Head->updateTerminator();
+
+ RemovedBlocks.push_back(CmpBB);
+ CmpBB->eraseFromParent();
+ DEBUG(dbgs() << "Result:\n" << *Head);
+ ++NumConverted;
+}
+
+int SSACCmpConv::expectedCodeSizeDelta() const {
+ int delta = 0;
+ // If the Head terminator was one of the cbz / tbz branches with built-in
+ // compare, we need to insert an explicit compare instruction in its place
+ // plus a branch instruction.
+ if (HeadCond[0].getImm() == -1) {
+ switch (HeadCond[1].getImm()) {
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ // Therefore delta += 1
+ delta = 1;
+ break;
+ default:
+ llvm_unreachable("Cannot convert Head branch");
+ }
+ }
+ // If the Cmp terminator was one of the cbz / tbz branches with
+ // built-in compare, it will be turned into a compare instruction
+ // into Head, but we do not save any instruction.
+ // Otherwise, we save the branch instruction.
+ switch (CmpMI->getOpcode()) {
+ default:
+ --delta;
+ break;
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ break;
+ }
+ return delta;
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64ConditionalCompares Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64ConditionalCompares : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MCSchedModel *SchedModel;
+ // Does the proceeded function has Oz attribute.
+ bool MinSize;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+ SSACCmpConv CmpConv;
+
+public:
+ static char ID;
+ AArch64ConditionalCompares() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const char *getPassName() const override {
+ return "AArch64 Conditional Compares";
+ }
+
+private:
+ bool tryConvert(MachineBasicBlock *);
+ void updateDomTree(ArrayRef<MachineBasicBlock *> Removed);
+ void updateLoops(ArrayRef<MachineBasicBlock *> Removed);
+ void invalidateTraces();
+ bool shouldConvert();
+};
+} // end anonymous namespace
+
+char AArch64ConditionalCompares::ID = 0;
+
+namespace llvm {
+void initializeAArch64ConditionalComparesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
+ "AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
+ "AArch64 CCMP Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionalCompares() {
+ return new AArch64ConditionalCompares();
+}
+
+void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void AArch64ConditionalCompares::updateDomTree(
+ ArrayRef<MachineBasicBlock *> Removed) {
+ // convert() removes CmpBB which was previously dominated by Head.
+ // CmpBB children should be transferred to Head.
+ MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
+ MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ assert(Node != HeadNode && "Cannot erase the head node");
+ assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
+ while (Node->getNumChildren())
+ DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ DomTree->eraseNode(Removed[i]);
+ }
+}
+
+/// Update LoopInfo after if-conversion.
+void
+AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
+ if (!Loops)
+ return;
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i)
+ Loops->removeBlock(Removed[i]);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void AArch64ConditionalCompares::invalidateTraces() {
+ Traces->invalidate(CmpConv.Head);
+ Traces->invalidate(CmpConv.CmpBB);
+}
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool AArch64ConditionalCompares::shouldConvert() {
+ // Stress testing mode disables all cost considerations.
+ if (Stress)
+ return true;
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ // Head dominates CmpBB, so it is always included in its trace.
+ MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB);
+
+ // If code size is the main concern
+ if (MinSize) {
+ int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
+ DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
+ // If we are minimizing the code size, do the conversion whatever
+ // the cost is.
+ if (CodeSizeDelta < 0)
+ return true;
+ if (CodeSizeDelta > 0) {
+ DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+ return false;
+ }
+ // CodeSizeDelta == 0, continue with the regular heuristics
+ }
+
+ // Heuristic: The compare conversion delays the execution of the branch
+ // instruction because we must wait for the inputs to the second compare as
+ // well. The branch has no dependent instructions, but delaying it increases
+ // the cost of a misprediction.
+ //
+ // Set a limit on the delay we will accept.
+ unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
+
+ // Instruction depths can be computed for all trace instructions above CmpBB.
+ unsigned HeadDepth =
+ Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth;
+ unsigned CmpBBDepth =
+ Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth;
+ DEBUG(dbgs() << "Head depth: " << HeadDepth
+ << "\nCmpBB depth: " << CmpBBDepth << '\n');
+ if (CmpBBDepth > HeadDepth + DelayLimit) {
+ DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+ << " cycles.\n");
+ return false;
+ }
+
+ // Check the resource depth at the bottom of CmpBB - these instructions will
+ // be speculated.
+ unsigned ResDepth = Trace.getResourceDepth(true);
+ DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
+
+ // Heuristic: The speculatively executed instructions must all be able to
+ // merge into the Head block. The Head critical path should dominate the
+ // resource cost of the speculated instructions.
+ if (ResDepth > HeadDepth) {
+ DEBUG(dbgs() << "Too many instructions to speculate.\n");
+ return false;
+ }
+ return true;
+}
+
+bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (CmpConv.canConvert(MBB) && shouldConvert()) {
+ invalidateTraces();
+ SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+ CmpConv.convert(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ SchedModel =
+ MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr;
+ MinSize = MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize);
+
+ bool Changed = false;
+ CmpConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+ // cmp-conversions from the same head block.
+ // Note that updateDomTree() modifies the children of the DomTree node
+ // currently being visited. The df_iterator supports that; it doesn't look at
+ // child_begin() / child_end() until after a node has been visited.
+ for (auto *I : depth_first(DomTree))
+ if (tryConvert(I->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
new file mode 100644
index 0000000..a2d853c
--- /dev/null
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -0,0 +1,134 @@
+//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When allowed by the instruction, replace a dead definition of a GPR with
+// the zero register. This makes the code a bit friendlier towards the
+// hardware's register renamer.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-dead-defs"
+
+STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+
+namespace {
+class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
+private:
+ const TargetRegisterInfo *TRI;
+ bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI);
+ bool processMachineBasicBlock(MachineBasicBlock &MBB);
+ bool usesFrameIndex(const MachineInstr &MI);
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &F) override;
+
+ const char *getPassName() const override { return "Dead register definitions"; }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char AArch64DeadRegisterDefinitions::ID = 0;
+} // end anonymous namespace
+
+bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg(
+ unsigned Reg, const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.implicit_operands())
+ if (MO.isReg() && MO.isDef())
+ if (TRI->regsOverlap(Reg, MO.getReg()))
+ return true;
+ return false;
+}
+
+bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) {
+ for (const MachineOperand &Op : MI.uses())
+ if (Op.isFI())
+ return true;
+ return false;
+}
+
+bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
+ MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineInstr &MI : MBB) {
+ if (usesFrameIndex(MI)) {
+ // We need to skip this instruction because while it appears to have a
+ // dead def it uses a frame index which might expand into a multi
+ // instruction sequence during EPI.
+ DEBUG(dbgs() << " Ignoring, operand is frame index\n");
+ continue;
+ }
+ for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isDead() && MO.isDef()) {
+ assert(!MO.isImplicit() && "Unexpected implicit def!");
+ DEBUG(dbgs() << " Dead def operand #" << i << " in:\n ";
+ MI.print(dbgs()));
+ // Be careful not to change the register if it's a tied operand.
+ if (MI.isRegTiedToUseOperand(i)) {
+ DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
+ continue;
+ }
+ // Don't change the register if there's an implicit def of a subreg or
+ // supperreg.
+ if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) {
+ DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n");
+ continue;
+ }
+ // Make sure the instruction take a register class that contains
+ // the zero register and replace it if so.
+ unsigned NewReg;
+ switch (MI.getDesc().OpInfo[i].RegClass) {
+ default:
+ DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ continue;
+ case AArch64::GPR32RegClassID:
+ NewReg = AArch64::WZR;
+ break;
+ case AArch64::GPR64RegClassID:
+ NewReg = AArch64::XZR;
+ break;
+ }
+ DEBUG(dbgs() << " Replacing with zero register. New:\n ");
+ MO.setReg(NewReg);
+ DEBUG(MI.print(dbgs()));
+ ++NumDeadDefsReplaced;
+ }
+ }
+ }
+ return Changed;
+}
+
+// Scan the function for instructions that have a dead definition of a
+// register. Replace that register with the zero register when possible.
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getTarget().getRegisterInfo();
+ bool Changed = false;
+ DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+
+ for (auto &MBB : MF)
+ if (processMachineBasicBlock(MBB))
+ Changed = true;
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64DeadRegisterDefinitions() {
+ return new AArch64DeadRegisterDefinitions();
+}
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
new file mode 100644
index 0000000..a76fd76
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -0,0 +1,749 @@
+//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations. This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+namespace {
+class AArch64ExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ AArch64ExpandPseudo() : MachineFunctionPass(ID) {}
+
+ const AArch64InstrInfo *TII;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ const char *getPassName() const override {
+ return "AArch64 pseudo instruction expansion pass";
+ }
+
+private:
+ bool expandMBB(MachineBasicBlock &MBB);
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned BitSize);
+};
+char AArch64ExpandPseudo::ID = 0;
+}
+
+/// \brief Transfer implicit operands on the pseudo instruction to the
+/// instructions created from the expansion.
+static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+ MachineInstrBuilder &DefMI) {
+ const MCInstrDesc &Desc = OldMI.getDesc();
+ for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
+ ++i) {
+ const MachineOperand &MO = OldMI.getOperand(i);
+ assert(MO.isReg() && MO.getReg());
+ if (MO.isUse())
+ UseMI.addOperand(MO);
+ else
+ DefMI.addOperand(MO);
+ }
+}
+
+/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// 64-bit value.
+static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
+ assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+ return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
+}
+
+/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
+/// value. Indices correspond to element numbers in a v4i16.
+static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
+ assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
+ const unsigned ShiftAmt = ToIdx * 16;
+
+ // Replicate the source chunk to the destination position.
+ const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
+ // Clear the destination chunk.
+ Imm &= ~(0xFFFFLL << ShiftAmt);
+ // Insert the replicated chunk.
+ return Imm | Chunk;
+}
+
+/// \brief Helper function which tries to materialize a 64-bit value with an
+/// ORR + MOVK instruction sequence.
+static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const AArch64InstrInfo *TII, unsigned ChunkIdx) {
+ assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+ const unsigned ShiftAmt = ChunkIdx * 16;
+
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ // Create the MOVK instruction.
+ const unsigned Imm16 = getChunk(UImm, ChunkIdx);
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
+/// can be materialized with an ORR instruction.
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
+ Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
+
+ return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
+}
+
+/// \brief Check for identical 16-bit chunks within the constant and if so
+/// materialize them with a single ORR instruction. The remaining one or two
+/// 16-bit chunks will be materialized with MOVK instructions.
+///
+/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
+/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
+/// an ORR instruction.
+///
+static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const AArch64InstrInfo *TII) {
+ typedef DenseMap<uint64_t, unsigned> CountMap;
+ CountMap Counts;
+
+ // Scan the constant and count how often every chunk occurs.
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ ++Counts[getChunk(UImm, Idx)];
+
+ // Traverse the chunks to find one which occurs more than once.
+ for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
+ Chunk != End; ++Chunk) {
+ const uint64_t ChunkVal = Chunk->first;
+ const unsigned Count = Chunk->second;
+
+ uint64_t Encoding = 0;
+
+ // We are looking for chunks which have two or three instances and can be
+ // materialized with an ORR instruction.
+ if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
+ continue;
+
+ const bool CountThree = Count == 3;
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+
+ unsigned ShiftAmt = 0;
+ uint64_t Imm16 = 0;
+ // Find the first chunk not materialized with the ORR instruction.
+ for (; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the first MOVK instruction.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(DstIsDead && CountThree))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+ // In case we have three instances the whole constant is now materialized
+ // and we can exit.
+ if (CountThree) {
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Find the remaining chunk which needs to be materialized.
+ for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the second MOVK instruction.
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+ transferImpOps(MI, MIB, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
+/// starts a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isStartChunk(uint64_t Chunk) {
+ if (Chunk == 0 || Chunk == UINT64_MAX)
+ return false;
+
+ return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64;
+}
+
+/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
+/// ends a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isEndChunk(uint64_t Chunk) {
+ if (Chunk == 0 || Chunk == UINT64_MAX)
+ return false;
+
+ return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64;
+}
+
+/// \brief Clear or set all bits in the chunk at the given index.
+static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
+ const uint64_t Mask = 0xFFFF;
+
+ if (Clear)
+ // Clear chunk in the immediate.
+ Imm &= ~(Mask << (Idx * 16));
+ else
+ // Set all bits in the immediate for the particular chunk.
+ Imm |= Mask << (Idx * 16);
+
+ return Imm;
+}
+
+/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// which might be interrupted by one or two chunks. If so, materialize the
+/// sequence of contiguous ones with an ORR instruction.
+/// Materialize the chunks which are either interrupting the sequence or outside
+/// of the sequence with a MOVK instruction.
+///
+/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
+/// which ends the sequence (0...1...). Then we are looking for constants which
+/// contain at least one S and E chunk.
+/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
+///
+/// We are also looking for constants like |S|A|B|E| where the contiguous
+/// sequence of ones wraps around the MSB into the LSB.
+///
+static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const AArch64InstrInfo *TII) {
+ const int NotSet = -1;
+ const uint64_t Mask = 0xFFFF;
+
+ int StartIdx = NotSet;
+ int EndIdx = NotSet;
+ // Try to find the chunks which start/end a contiguous sequence of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ int64_t Chunk = getChunk(UImm, Idx);
+ // Sign extend the 16-bit chunk to 64-bit.
+ Chunk = (Chunk << 48) >> 48;
+
+ if (isStartChunk(Chunk))
+ StartIdx = Idx;
+ else if (isEndChunk(Chunk))
+ EndIdx = Idx;
+ }
+
+ // Early exit in case we can't find a start/end chunk.
+ if (StartIdx == NotSet || EndIdx == NotSet)
+ return false;
+
+ // Outside of the contiguous sequence of ones everything needs to be zero.
+ uint64_t Outside = 0;
+ // Chunks between the start and end chunk need to have all their bits set.
+ uint64_t Inside = Mask;
+
+ // If our contiguous sequence of ones wraps around from the MSB into the LSB,
+ // just swap indices and pretend we are materializing a contiguous sequence
+ // of zeros surrounded by a contiguous sequence of ones.
+ if (StartIdx > EndIdx) {
+ std::swap(StartIdx, EndIdx);
+ std::swap(Outside, Inside);
+ }
+
+ uint64_t OrrImm = UImm;
+ int FirstMovkIdx = NotSet;
+ int SecondMovkIdx = NotSet;
+
+ // Find out which chunks we need to patch up to obtain a contiguous sequence
+ // of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ const uint64_t Chunk = getChunk(UImm, Idx);
+
+ // Check whether we are looking at a chunk which is not part of the
+ // contiguous sequence of ones.
+ if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
+ OrrImm = updateImm(OrrImm, Idx, Outside == 0);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+
+ // Check whether we are looking a chunk which is part of the contiguous
+ // sequence of ones.
+ } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
+ OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+ }
+ }
+ assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
+
+ // Create the ORR-immediate instruction.
+ uint64_t Encoding = 0;
+ AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+
+ const bool SingleMovk = SecondMovkIdx == NotSet;
+ // Create the first MOVK instruction.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
+ .addReg(DstReg)
+ .addImm(getChunk(UImm, FirstMovkIdx))
+ .addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
+
+ // Early exit in case we only need to emit a single MOVK instruction.
+ if (SingleMovk) {
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Create the second MOVK instruction.
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(getChunk(UImm, SecondMovkIdx))
+ .addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
+
+ transferImpOps(MI, MIB, MIB2);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// real move-immediate instructions to synthesize the immediate.
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize) {
+ MachineInstr &MI = *MBBI;
+ uint64_t Imm = MI.getOperand(1).getImm();
+ const unsigned Mask = 0xFFFF;
+
+ // Try a MOVI instruction (aka ORR-immediate with the zero register).
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addOperand(MI.getOperand(0))
+ .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
+ .addImm(Encoding);
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Scan the immediate and count the number of 16-bit chunks which are either
+ // all ones or all zeros.
+ unsigned OneChunks = 0;
+ unsigned ZeroChunks = 0;
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ const unsigned Chunk = (Imm >> Shift) & Mask;
+ if (Chunk == Mask)
+ OneChunks++;
+ else if (Chunk == 0)
+ ZeroChunks++;
+ }
+
+ // Since we can't materialize the constant with a single ORR instruction,
+ // let's see whether we can materialize 3/4 of the constant with an ORR
+ // instruction and use an additional MOVK instruction to materialize the
+ // remaining 1/4.
+ //
+ // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
+ //
+ // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
+ // we would create the following instruction sequence:
+ //
+ // ORR x0, xzr, |A|X|A|X|
+ // MOVK x0, |B|, LSL #16
+ //
+ // Only look at 64-bit constants which can't be materialized with a single
+ // instruction e.g. which have less than either three all zero or all one
+ // chunks.
+ //
+ // Ignore 32-bit constants here, they always can be materialized with a
+ // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
+ // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
+ // Thus we fall back to the default code below which in the best case creates
+ // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
+ //
+ if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
+ // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
+ // identical?
+ if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 3 into element 1.
+ uint64_t OrrImm = replicateChunk(UImm, 3, 1);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
+ return true;
+
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 1 into element 3.
+ OrrImm = replicateChunk(UImm, 1, 3);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
+ return true;
+
+ // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
+ // identical?
+ } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 2 into element 0.
+ uint64_t OrrImm = replicateChunk(UImm, 2, 0);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
+ return true;
+
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 1 into element 3.
+ OrrImm = replicateChunk(UImm, 0, 2);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
+ return true;
+ }
+ }
+
+ // Check for identical 16-bit chunks within the constant and if so materialize
+ // them with a single ORR instruction. The remaining one or two 16-bit chunks
+ // will be materialized with MOVK instructions.
+ if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
+ return true;
+
+ // Check whether the constant contains a sequence of contiguous ones, which
+ // might be interrupted by one or two chunks. If so, materialize the sequence
+ // of contiguous ones with an ORR instruction. Materialize the chunks which
+ // are either interrupting the sequence or outside of the sequence with a
+ // MOVK instruction.
+ if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
+ return true;
+
+ // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
+ // more MOVK instructions to insert additional 16-bit portions into the
+ // lower bits.
+ bool isNeg = false;
+
+ // Use MOVN to materialize the high bits if we have more all one chunks
+ // than all zero chunks.
+ if (OneChunks > ZeroChunks) {
+ isNeg = true;
+ Imm = ~Imm;
+ }
+
+ unsigned FirstOpc;
+ if (BitSize == 32) {
+ Imm &= (1LL << 32) - 1;
+ FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
+ } else {
+ FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
+ }
+ unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
+ unsigned LastShift = 0; // LSL amount for last MOVK
+ if (Imm != 0) {
+ unsigned LZ = countLeadingZeros(Imm);
+ unsigned TZ = countTrailingZeros(Imm);
+ Shift = ((63 - LZ) / 16) * 16;
+ LastShift = (TZ / 16) * 16;
+ }
+ unsigned Imm16 = (Imm >> Shift) & Mask;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
+ .addReg(DstReg, RegState::Define |
+ getDeadRegState(DstIsDead && Shift == LastShift))
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+
+ // If a MOVN was used for the high bits of a negative value, flip the rest
+ // of the bits back for use with MOVK.
+ if (isNeg)
+ Imm = ~Imm;
+
+ if (Shift == LastShift) {
+ transferImpOps(MI, MIB1, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ MachineInstrBuilder MIB2;
+ unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+ while (Shift != LastShift) {
+ Shift -= 16;
+ Imm16 = (Imm >> Shift) & Mask;
+ if (Imm16 == (isNeg ? Mask : 0))
+ continue; // This 16-bit portion is already set correctly.
+ MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(DstIsDead && Shift == LastShift))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+ }
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// do the expansion and return true. Otherwise return false.
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+
+ case AArch64::ADDWrr:
+ case AArch64::SUBWrr:
+ case AArch64::ADDXrr:
+ case AArch64::SUBXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::SUBSWrr:
+ case AArch64::ADDSXrr:
+ case AArch64::SUBSXrr:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXrr:
+ case AArch64::BICWrr:
+ case AArch64::BICXrr:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ case AArch64::EONWrr:
+ case AArch64::EONXrr:
+ case AArch64::EORWrr:
+ case AArch64::EORXrr:
+ case AArch64::ORNWrr:
+ case AArch64::ORNXrr:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXrr: {
+ unsigned Opcode;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break;
+ case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break;
+ case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break;
+ case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break;
+ case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break;
+ case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break;
+ case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break;
+ case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break;
+ case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break;
+ case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break;
+ case AArch64::BICWrr: Opcode = AArch64::BICWrs; break;
+ case AArch64::BICXrr: Opcode = AArch64::BICXrs; break;
+ case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break;
+ case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break;
+ case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break;
+ case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break;
+ case AArch64::EONWrr: Opcode = AArch64::EONWrs; break;
+ case AArch64::EONXrr: Opcode = AArch64::EONXrs; break;
+ case AArch64::EORWrr: Opcode = AArch64::EORWrs; break;
+ case AArch64::EORXrr: Opcode = AArch64::EORXrs; break;
+ case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break;
+ case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break;
+ case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
+ case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
+ }
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
+ MI.getOperand(0).getReg())
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ transferImpOps(MI, MIB1, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case AArch64::FCVTSHpseudo: {
+ MachineOperand Src = MI.getOperand(1);
+ Src.setImplicit();
+ unsigned SrcH =
+ TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub);
+ auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr))
+ .addOperand(MI.getOperand(0))
+ .addReg(SrcH, RegState::Undef)
+ .addOperand(Src);
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+ case AArch64::LOADgot: {
+ // Expand into ADRP + LDR.
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const MachineOperand &MO1 = MI.getOperand(1);
+ unsigned Flags = MO1.getTargetFlags();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
+ .addOperand(MI.getOperand(0))
+ .addReg(DstReg);
+
+ if (MO1.isGlobal()) {
+ MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
+ MIB2.addGlobalAddress(MO1.getGlobal(), 0,
+ Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ } else if (MO1.isSymbol()) {
+ MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
+ MIB2.addExternalSymbol(MO1.getSymbolName(),
+ Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ } else {
+ assert(MO1.isCPI() &&
+ "Only expect globals, externalsymbols, or constant pools");
+ MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+ Flags | AArch64II::MO_PAGE);
+ MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+ Flags | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ }
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case AArch64::MOVaddr:
+ case AArch64::MOVaddrJT:
+ case AArch64::MOVaddrCP:
+ case AArch64::MOVaddrBA:
+ case AArch64::MOVaddrTLS:
+ case AArch64::MOVaddrEXT: {
+ // Expand into ADRP + ADD.
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
+ .addOperand(MI.getOperand(1));
+
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(DstReg)
+ .addOperand(MI.getOperand(2))
+ .addImm(0);
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case AArch64::MOVi32imm:
+ return expandMOVImm(MBB, MBBI, 32);
+ case AArch64::MOVi64imm:
+ return expandMOVImm(MBB, MBBI, 64);
+ case AArch64::RET_ReallyLR:
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
+ .addReg(AArch64::LR);
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+/// \brief Iterate over the instructions in basic block MBB and expand any
+/// pseudo instructions. Return true if anything was modified.
+bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= expandMI(MBB, MBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+
+ bool Modified = false;
+ for (auto &MBB : MF)
+ Modified |= expandMBB(MBB);
+ return Modified;
+}
+
+/// \brief Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createAArch64ExpandPseudoPass() {
+ return new AArch64ExpandPseudo();
+}
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
new file mode 100644
index 0000000..c3b5369
--- /dev/null
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -0,0 +1,1981 @@
+//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// AArch64GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+namespace {
+
+class AArch64FastISel : public FastISel {
+
+ class Address {
+ public:
+ typedef enum {
+ RegBase,
+ FrameIndexBase
+ } BaseKind;
+
+ private:
+ BaseKind Kind;
+ union {
+ unsigned Reg;
+ int FI;
+ } Base;
+ int64_t Offset;
+
+ public:
+ Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; }
+ void setKind(BaseKind K) { Kind = K; }
+ BaseKind getKind() const { return Kind; }
+ bool isRegBase() const { return Kind == RegBase; }
+ bool isFIBase() const { return Kind == FrameIndexBase; }
+ void setReg(unsigned Reg) {
+ assert(isRegBase() && "Invalid base register access!");
+ Base.Reg = Reg;
+ }
+ unsigned getReg() const {
+ assert(isRegBase() && "Invalid base register access!");
+ return Base.Reg;
+ }
+ void setFI(unsigned FI) {
+ assert(isFIBase() && "Invalid base frame index access!");
+ Base.FI = FI;
+ }
+ unsigned getFI() const {
+ assert(isFIBase() && "Invalid base frame index access!");
+ return Base.FI;
+ }
+ void setOffset(int64_t O) { Offset = O; }
+ int64_t getOffset() { return Offset; }
+
+ bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
+ };
+
+ /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const AArch64Subtarget *Subtarget;
+ LLVMContext *Context;
+
+private:
+ // Selection routines.
+ bool SelectLoad(const Instruction *I);
+ bool SelectStore(const Instruction *I);
+ bool SelectBranch(const Instruction *I);
+ bool SelectIndirectBr(const Instruction *I);
+ bool SelectCmp(const Instruction *I);
+ bool SelectSelect(const Instruction *I);
+ bool SelectFPExt(const Instruction *I);
+ bool SelectFPTrunc(const Instruction *I);
+ bool SelectFPToInt(const Instruction *I, bool Signed);
+ bool SelectIntToFP(const Instruction *I, bool Signed);
+ bool SelectRem(const Instruction *I, unsigned ISDOpcode);
+ bool SelectCall(const Instruction *I, const char *IntrMemName);
+ bool SelectIntrinsicCall(const IntrinsicInst &I);
+ bool SelectRet(const Instruction *I);
+ bool SelectTrunc(const Instruction *I);
+ bool SelectIntExt(const Instruction *I);
+ bool SelectMul(const Instruction *I);
+
+ // Utility helper routines.
+ bool isTypeLegal(Type *Ty, MVT &VT);
+ bool isLoadStoreTypeLegal(Type *Ty, MVT &VT);
+ bool ComputeAddress(const Value *Obj, Address &Addr);
+ bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
+ bool UseUnscaled);
+ void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+ unsigned Flags, bool UseUnscaled);
+ bool IsMemCpySmall(uint64_t Len, unsigned Alignment);
+ bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+ unsigned Alignment);
+ // Emit functions.
+ bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt);
+ bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+ bool UseUnscaled = false);
+ bool EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+ bool UseUnscaled = false);
+ unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+
+ unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned AArch64MaterializeGV(const GlobalValue *GV);
+
+ // Call handling routines.
+private:
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+ bool ProcessCallArgs(SmallVectorImpl<Value *> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+ unsigned &NumBytes);
+ bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC, unsigned &NumBytes);
+
+public:
+ // Backend specific FastISel code.
+ unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+ unsigned TargetMaterializeConstant(const Constant *C) override;
+
+ explicit AArch64FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo) {
+ Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+ Context = &funcInfo.Fn->getContext();
+ }
+
+ bool TargetSelectInstruction(const Instruction *I) override;
+
+#include "AArch64GenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+#include "AArch64GenCallingConv.inc"
+
+CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+ if (CC == CallingConv::WebKit_JS)
+ return CC_AArch64_WebKit_JS;
+ return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
+}
+
+unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+ assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
+ "Alloca should always return a pointer.");
+
+ // Don't handle dynamic allocas.
+ if (!FuncInfo.StaticAllocaMap.count(AI))
+ return 0;
+
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+ ResultReg)
+ .addFrameIndex(SI->second)
+ .addImm(0)
+ .addImm(0);
+ return ResultReg;
+ }
+
+ return 0;
+}
+
+unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return 0;
+
+ const APFloat Val = CFP->getValueAPF();
+ bool is64bit = (VT == MVT::f64);
+
+ // This checks to see if we can use FMOV instructions to materialize
+ // a constant, otherwise we have to materialize via the constant pool.
+ if (TLI.isFPImmLegal(Val, VT)) {
+ int Imm;
+ unsigned Opc;
+ if (is64bit) {
+ Imm = AArch64_AM::getFP64Imm(Val);
+ Opc = AArch64::FMOVDi;
+ } else {
+ Imm = AArch64_AM::getFP32Imm(Val);
+ Opc = AArch64::FMOVSi;
+ }
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addImm(Imm);
+ return ResultReg;
+ }
+
+ // Materialize via constant pool. MachineConstantPool wants an explicit
+ // alignment.
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+ if (Align == 0)
+ Align = DL.getTypeAllocSize(CFP->getType());
+
+ unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+ ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE);
+
+ unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui;
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(ADRPReg)
+ .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
+ // We can't handle thread-local variables quickly yet. Unfortunately we have
+ // to peer through any aliases to find out if that rule applies.
+ const GlobalValue *TLSGV = GV;
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ TLSGV = GA->getAliasee();
+
+ // MachO still uses GOT for large code-model accesses, but ELF requires
+ // movz/movk sequences, which FastISel doesn't handle yet.
+ if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+ return 0;
+
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(TLSGV))
+ if (GVar->isThreadLocal())
+ return 0;
+
+ unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+ EVT DestEVT = TLI.getValueType(GV->getType(), true);
+ if (!DestEVT.isSimple())
+ return 0;
+
+ unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ unsigned ResultReg;
+
+ if (OpFlags & AArch64II::MO_GOT) {
+ // ADRP + LDRX
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+ ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+ ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+ ResultReg)
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ } else {
+ // ADRP + ADDX
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+ ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+
+ ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+ ResultReg)
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+ .addImm(0);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
+ EVT CEVT = TLI.getValueType(C->getType(), true);
+
+ // Only handle simple types.
+ if (!CEVT.isSimple())
+ return 0;
+ MVT VT = CEVT.getSimpleVT();
+
+ // FIXME: Handle ConstantInt.
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return AArch64MaterializeFP(CFP, VT);
+ else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return AArch64MaterializeGV(GV);
+
+ return 0;
+}
+
+// Computes the address to get to an object.
+bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return ComputeAddress(U->getOperand(0), Addr);
+ }
+ case Instruction::IntToPtr: {
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return ComputeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::PtrToInt: {
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return ComputeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr;
+ uint64_t TmpOffset = Addr.getOffset();
+
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ gep_type_iterator GTI = gep_type_begin(U);
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
+ ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (ComputeAddress(U->getOperand(0), Addr))
+ return true;
+
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ }
+
+ // Try to get this in a register if nothing else has worked.
+ if (!Addr.isValid())
+ Addr.setReg(getRegForValue(Obj));
+ return Addr.isValid();
+}
+
+bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
+ EVT evt = TLI.getValueType(Ty, true);
+
+ // Only handle simple types.
+ if (evt == MVT::Other || !evt.isSimple())
+ return false;
+ VT = evt.getSimpleVT();
+
+ // This is a legal type, but it's not something we handle in fast-isel.
+ if (VT == MVT::f128)
+ return false;
+
+ // Handle all other legal types, i.e. a register that will directly hold this
+ // value.
+ return TLI.isTypeLegal(VT);
+}
+
+bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+ if (isTypeLegal(Ty, VT))
+ return true;
+
+ // If this is a type than can be sign or zero-extended to a basic operation
+ // go ahead and accept it now. For stores, this reflects truncation.
+ if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+ return true;
+
+ return false;
+}
+
+bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
+ int64_t ScaleFactor, bool UseUnscaled) {
+ bool needsLowering = false;
+ int64_t Offset = Addr.getOffset();
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ if (!UseUnscaled)
+ // Using scaled, 12-bit, unsigned immediate offsets.
+ needsLowering = ((Offset & 0xfff) != Offset);
+ else
+ // Using unscaled, 9-bit, signed immediate offsets.
+ needsLowering = (Offset > 256 || Offset < -256);
+ break;
+ }
+
+ // FIXME: If this is a stack pointer and the offset needs to be simplified
+ // then put the alloca address into a register, set the base type back to
+ // register and continue. This should almost never happen.
+ if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
+ return false;
+ }
+
+ // Since the offset is too large for the load/store instruction get the
+ // reg+offset into a register.
+ if (needsLowering) {
+ uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor;
+ unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false,
+ UnscaledOffset, MVT::i64);
+ if (ResultReg == 0)
+ return false;
+ Addr.setReg(ResultReg);
+ Addr.setOffset(0);
+ }
+ return true;
+}
+
+void AArch64FastISel::AddLoadStoreOperands(Address &Addr,
+ const MachineInstrBuilder &MIB,
+ unsigned Flags, bool UseUnscaled) {
+ int64_t Offset = Addr.getOffset();
+ // Frame base works a bit differently. Handle it separately.
+ if (Addr.getKind() == Address::FrameIndexBase) {
+ int FI = Addr.getFI();
+ // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
+ // and alignment should be based on the VT.
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ // Now add the rest of the operands.
+ MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+ } else {
+ // Now add the rest of the operands.
+ MIB.addReg(Addr.getReg());
+ MIB.addImm(Offset);
+ }
+}
+
+bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+ bool UseUnscaled) {
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ if (!UseUnscaled && Addr.getOffset() < 0)
+ UseUnscaled = true;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool VTIsi1 = false;
+ int64_t ScaleFactor = 0;
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ VTIsi1 = true;
+ // Intentional fall-through.
+ case MVT::i8:
+ Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui;
+ RC = &AArch64::GPR32RegClass;
+ ScaleFactor = 1;
+ break;
+ case MVT::i16:
+ Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui;
+ RC = &AArch64::GPR32RegClass;
+ ScaleFactor = 2;
+ break;
+ case MVT::i32:
+ Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui;
+ RC = &AArch64::GPR32RegClass;
+ ScaleFactor = 4;
+ break;
+ case MVT::i64:
+ Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+ RC = &AArch64::GPR64RegClass;
+ ScaleFactor = 8;
+ break;
+ case MVT::f32:
+ Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui;
+ RC = TLI.getRegClassFor(VT);
+ ScaleFactor = 4;
+ break;
+ case MVT::f64:
+ Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui;
+ RC = TLI.getRegClassFor(VT);
+ ScaleFactor = 8;
+ break;
+ }
+ // Scale the offset.
+ if (!UseUnscaled) {
+ int64_t Offset = Addr.getOffset();
+ if (Offset & (ScaleFactor - 1))
+ // Retry using an unscaled, 9-bit, signed immediate offset.
+ return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true);
+
+ Addr.setOffset(Offset / ScaleFactor);
+ }
+
+ // Simplify this down to something we can handle.
+ if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
+ return false;
+
+ // Create the base instruction, then add the operands.
+ ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg);
+ AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled);
+
+ // Loading an i1 requires special handling.
+ if (VTIsi1) {
+ MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass);
+ unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+ ANDReg)
+ .addReg(ResultReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ ResultReg = ANDReg;
+ }
+ return true;
+}
+
+bool AArch64FastISel::SelectLoad(const Instruction *I) {
+ MVT VT;
+ // Verify we have a legal type before going any further. Currently, we handle
+ // simple types that will directly fit in a register (i32/f32/i64/f64) or
+ // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+ if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic())
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!ComputeAddress(I->getOperand(0), Addr))
+ return false;
+
+ unsigned ResultReg;
+ if (!EmitLoad(VT, ResultReg, Addr))
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+ bool UseUnscaled) {
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ if (!UseUnscaled && Addr.getOffset() < 0)
+ UseUnscaled = true;
+
+ unsigned StrOpc;
+ bool VTIsi1 = false;
+ int64_t ScaleFactor = 0;
+ // Using scaled, 12-bit, unsigned immediate offsets.
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ VTIsi1 = true;
+ case MVT::i8:
+ StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui;
+ ScaleFactor = 1;
+ break;
+ case MVT::i16:
+ StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui;
+ ScaleFactor = 2;
+ break;
+ case MVT::i32:
+ StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui;
+ ScaleFactor = 4;
+ break;
+ case MVT::i64:
+ StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui;
+ ScaleFactor = 8;
+ break;
+ case MVT::f32:
+ StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui;
+ ScaleFactor = 4;
+ break;
+ case MVT::f64:
+ StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui;
+ ScaleFactor = 8;
+ break;
+ }
+ // Scale the offset.
+ if (!UseUnscaled) {
+ int64_t Offset = Addr.getOffset();
+ if (Offset & (ScaleFactor - 1))
+ // Retry using an unscaled, 9-bit, signed immediate offset.
+ return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true);
+
+ Addr.setOffset(Offset / ScaleFactor);
+ }
+
+ // Simplify this down to something we can handle.
+ if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
+ return false;
+
+ // Storing an i1 requires special handling.
+ if (VTIsi1) {
+ MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+ unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+ ANDReg)
+ .addReg(SrcReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ SrcReg = ANDReg;
+ }
+ // Create the base instruction, then add the operands.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(StrOpc)).addReg(SrcReg);
+ AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled);
+ return true;
+}
+
+bool AArch64FastISel::SelectStore(const Instruction *I) {
+ MVT VT;
+ Value *Op0 = I->getOperand(0);
+ // Verify we have a legal type before going any further. Currently, we handle
+ // simple types that will directly fit in a register (i32/f32/i64/f64) or
+ // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+ if (!isLoadStoreTypeLegal(Op0->getType(), VT) ||
+ cast<StoreInst>(I)->isAtomic())
+ return false;
+
+ // Get the value to be stored into a register.
+ unsigned SrcReg = getRegForValue(Op0);
+ if (SrcReg == 0)
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!ComputeAddress(I->getOperand(1), Addr))
+ return false;
+
+ if (!EmitStore(VT, SrcReg, Addr))
+ return false;
+ return true;
+}
+
+static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UEQ:
+ default:
+ // AL is our "false" for now. The other two need more compares.
+ return AArch64CC::AL;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::FCMP_OEQ:
+ return AArch64CC::EQ;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::FCMP_OGT:
+ return AArch64CC::GT;
+ case CmpInst::ICMP_SGE:
+ case CmpInst::FCMP_OGE:
+ return AArch64CC::GE;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::FCMP_UGT:
+ return AArch64CC::HI;
+ case CmpInst::FCMP_OLT:
+ return AArch64CC::MI;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::FCMP_OLE:
+ return AArch64CC::LS;
+ case CmpInst::FCMP_ORD:
+ return AArch64CC::VC;
+ case CmpInst::FCMP_UNO:
+ return AArch64CC::VS;
+ case CmpInst::FCMP_UGE:
+ return AArch64CC::PL;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::FCMP_ULT:
+ return AArch64CC::LT;
+ case CmpInst::ICMP_SLE:
+ case CmpInst::FCMP_ULE:
+ return AArch64CC::LE;
+ case CmpInst::FCMP_UNE:
+ case CmpInst::ICMP_NE:
+ return AArch64CC::NE;
+ case CmpInst::ICMP_UGE:
+ return AArch64CC::HS;
+ case CmpInst::ICMP_ULT:
+ return AArch64CC::LO;
+ }
+}
+
+bool AArch64FastISel::SelectBranch(const Instruction *I) {
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+ // We may not handle every CC for now.
+ AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
+ if (CC == AArch64CC::AL)
+ return false;
+
+ // Emit the cmp.
+ if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
+
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+ FuncInfo.MBB->addSuccessor(TBB);
+
+ FastEmitBranch(FBB, DbgLoc);
+ return true;
+ }
+ } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+ MVT SrcVT;
+ if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+ (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+ unsigned CondReg = getRegForValue(TI->getOperand(0));
+ if (CondReg == 0)
+ return false;
+
+ // Issue an extract_subreg to get the lower 32-bits.
+ if (SrcVT == MVT::i64)
+ CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+ AArch64::sub_32);
+
+ MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+ unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::ANDWri), ANDReg)
+ .addReg(CondReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBSWri))
+ .addReg(ANDReg)
+ .addReg(ANDReg)
+ .addImm(0)
+ .addImm(0);
+
+ unsigned CC = AArch64CC::NE;
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ CC = AArch64CC::EQ;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+ FuncInfo.MBB->addSuccessor(TBB);
+ FastEmitBranch(FBB, DbgLoc);
+ return true;
+ }
+ } else if (const ConstantInt *CI =
+ dyn_cast<ConstantInt>(BI->getCondition())) {
+ uint64_t Imm = CI->getZExtValue();
+ MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
+ .addMBB(Target);
+ FuncInfo.MBB->addSuccessor(Target);
+ return true;
+ }
+
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (CondReg == 0)
+ return false;
+
+ // We've been divorced from our compare! Our block was split, and
+ // now our compare lives in a predecessor block. We musn't
+ // re-compare here, as the children of the compare aren't guaranteed
+ // live across the block boundary (we *could* check for this).
+ // Regardless, the compare has been done in the predecessor block,
+ // and it left a value for us in a virtual register. Ergo, we test
+ // the one-bit value left in the virtual register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
+ AArch64::WZR)
+ .addReg(CondReg)
+ .addImm(0)
+ .addImm(0);
+
+ unsigned CC = AArch64CC::NE;
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ CC = AArch64CC::EQ;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+ FuncInfo.MBB->addSuccessor(TBB);
+ FastEmitBranch(FBB, DbgLoc);
+ return true;
+}
+
+bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
+ const IndirectBrInst *BI = cast<IndirectBrInst>(I);
+ unsigned AddrReg = getRegForValue(BI->getOperand(0));
+ if (AddrReg == 0)
+ return false;
+
+ // Emit the indirect branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
+ .addReg(AddrReg);
+
+ // Make sure the CFG is up-to-date.
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]);
+
+ return true;
+}
+
+bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
+ Type *Ty = Src1Value->getType();
+ EVT SrcEVT = TLI.getValueType(Ty, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ MVT SrcVT = SrcEVT.getSimpleVT();
+
+ // Check to see if the 2nd operand is a constant that we can encode directly
+ // in the compare.
+ uint64_t Imm;
+ bool UseImm = false;
+ bool isNegativeImm = false;
+ if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+ if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+ const APInt &CIVal = ConstInt->getValue();
+
+ Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
+ if (CIVal.isNegative()) {
+ isNegativeImm = true;
+ Imm = -Imm;
+ }
+ // FIXME: We can handle more immediates using shifts.
+ UseImm = ((Imm & 0xfff) == Imm);
+ }
+ } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+ if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+ if (ConstFP->isZero() && !ConstFP->isNegative())
+ UseImm = true;
+ }
+
+ unsigned ZReg;
+ unsigned CmpOpc;
+ bool isICmp = true;
+ bool needsExt = false;
+ switch (SrcVT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ needsExt = true;
+ // Intentional fall-through.
+ case MVT::i32:
+ ZReg = AArch64::WZR;
+ if (UseImm)
+ CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri;
+ else
+ CmpOpc = AArch64::SUBSWrr;
+ break;
+ case MVT::i64:
+ ZReg = AArch64::XZR;
+ if (UseImm)
+ CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri;
+ else
+ CmpOpc = AArch64::SUBSXrr;
+ break;
+ case MVT::f32:
+ isICmp = false;
+ CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr;
+ break;
+ case MVT::f64:
+ isICmp = false;
+ CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr;
+ break;
+ }
+
+ unsigned SrcReg1 = getRegForValue(Src1Value);
+ if (SrcReg1 == 0)
+ return false;
+
+ unsigned SrcReg2;
+ if (!UseImm) {
+ SrcReg2 = getRegForValue(Src2Value);
+ if (SrcReg2 == 0)
+ return false;
+ }
+
+ // We have i1, i8, or i16, we need to either zero extend or sign extend.
+ if (needsExt) {
+ SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+ if (SrcReg1 == 0)
+ return false;
+ if (!UseImm) {
+ SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+ if (SrcReg2 == 0)
+ return false;
+ }
+ }
+
+ if (isICmp) {
+ if (UseImm)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(ZReg)
+ .addReg(SrcReg1)
+ .addImm(Imm)
+ .addImm(0);
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(ZReg)
+ .addReg(SrcReg1)
+ .addReg(SrcReg2);
+ } else {
+ if (UseImm)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(SrcReg1);
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(SrcReg1)
+ .addReg(SrcReg2);
+ }
+ return true;
+}
+
+bool AArch64FastISel::SelectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ // We may not handle every CC for now.
+ AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
+ if (CC == AArch64CC::AL)
+ return false;
+
+ // Emit the cmp.
+ if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
+
+ // Now set a register based on the comparison.
+ AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
+ unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+ ResultReg)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::WZR)
+ .addImm(invertedCC);
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::SelectSelect(const Instruction *I) {
+ const SelectInst *SI = cast<SelectInst>(I);
+
+ EVT DestEVT = TLI.getValueType(SI->getType(), true);
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT DestVT = DestEVT.getSimpleVT();
+ if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 &&
+ DestVT != MVT::f64)
+ return false;
+
+ unsigned CondReg = getRegForValue(SI->getCondition());
+ if (CondReg == 0)
+ return false;
+ unsigned TrueReg = getRegForValue(SI->getTrueValue());
+ if (TrueReg == 0)
+ return false;
+ unsigned FalseReg = getRegForValue(SI->getFalseValue());
+ if (FalseReg == 0)
+ return false;
+
+
+ MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+ unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+ ANDReg)
+ .addReg(CondReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri))
+ .addReg(ANDReg)
+ .addReg(ANDReg)
+ .addImm(0)
+ .addImm(0);
+
+ unsigned SelectOpc;
+ switch (DestVT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i32:
+ SelectOpc = AArch64::CSELWr;
+ break;
+ case MVT::i64:
+ SelectOpc = AArch64::CSELXr;
+ break;
+ case MVT::f32:
+ SelectOpc = AArch64::FCSELSrrr;
+ break;
+ case MVT::f64:
+ SelectOpc = AArch64::FCSELDrrr;
+ break;
+ }
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc),
+ ResultReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .addImm(AArch64CC::NE);
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::SelectFPExt(const Instruction *I) {
+ Value *V = I->getOperand(0);
+ if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
+ return false;
+
+ unsigned Op = getRegForValue(V);
+ if (Op == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
+ ResultReg).addReg(Op);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
+ Value *V = I->getOperand(0);
+ if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
+ return false;
+
+ unsigned Op = getRegForValue(V);
+ if (Op == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
+ ResultReg).addReg(Op);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+// FPToUI and FPToSI
+bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+ MVT DestVT;
+ if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+ return false;
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (SrcReg == 0)
+ return false;
+
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+ if (SrcVT == MVT::f128)
+ return false;
+
+ unsigned Opc;
+ if (SrcVT == MVT::f64) {
+ if (Signed)
+ Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr;
+ else
+ Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr;
+ } else {
+ if (Signed)
+ Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr;
+ else
+ Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr;
+ }
+ unsigned ResultReg = createResultReg(
+ DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+ MVT DestVT;
+ if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+ return false;
+ assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+ "Unexpected value type.");
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (SrcReg == 0)
+ return false;
+
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+
+ // Handle sign-extension.
+ if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+ SrcReg =
+ EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+ if (SrcReg == 0)
+ return false;
+ }
+
+ MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass
+ : &AArch64::GPR32RegClass);
+
+ unsigned Opc;
+ if (SrcVT == MVT::i64) {
+ if (Signed)
+ Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri;
+ else
+ Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri;
+ } else {
+ if (Signed)
+ Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri;
+ else
+ Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
+ }
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::ProcessCallArgs(
+ SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+ unsigned &NumBytes) {
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+ .addImm(NumBytes);
+
+ // Process the args.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Arg = ArgRegs[VA.getValNo()];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Handle arg promotion: SExt, ZExt, AExt.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt: {
+ MVT DestVT = VA.getLocVT();
+ MVT SrcVT = ArgVT;
+ Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
+ if (Arg == 0)
+ return false;
+ ArgVT = DestVT;
+ break;
+ }
+ case CCValAssign::AExt:
+ // Intentional fall-through.
+ case CCValAssign::ZExt: {
+ MVT DestVT = VA.getLocVT();
+ MVT SrcVT = ArgVT;
+ Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
+ if (Arg == 0)
+ return false;
+ ArgVT = DestVT;
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown arg promotion!");
+ }
+
+ // Now copy/store arg to correct locations.
+ if (VA.isRegLoc() && !VA.needsCustom()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+ RegArgs.push_back(VA.getLocReg());
+ } else if (VA.needsCustom()) {
+ // FIXME: Handle custom args.
+ return false;
+ } else {
+ assert(VA.isMemLoc() && "Assuming store on stack.");
+
+ // Need to store on the stack.
+ unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+ unsigned BEAlign = 0;
+ if (ArgSize < 8 && !Subtarget->isLittleEndian())
+ BEAlign = 8 - ArgSize;
+
+ Address Addr;
+ Addr.setKind(Address::RegBase);
+ Addr.setReg(AArch64::SP);
+ Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+ if (!EmitStore(ArgVT, Arg, Addr))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC,
+ unsigned &NumBytes) {
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(NumBytes)
+ .addImm(0);
+
+ // Now the return value.
+ if (RetVT != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+ // Only handle a single return value.
+ if (RVLocs.size() != 1)
+ return false;
+
+ // Copy all of the result registers out of their specified physreg.
+ MVT CopyVT = RVLocs[0].getValVT();
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(RVLocs[0].getLocReg());
+ UsedRegs.push_back(RVLocs[0].getLocReg());
+
+ // Finally update the result.
+ UpdateValueMap(I, ResultReg);
+ }
+
+ return true;
+}
+
+bool AArch64FastISel::SelectCall(const Instruction *I,
+ const char *IntrMemName = nullptr) {
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = CI->getCalledValue();
+
+ // Don't handle inline asm or intrinsics.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Only handle global variable Callees.
+ const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+ if (!GV)
+ return false;
+
+ // Check the calling convention.
+ ImmutableCallSite CS(CI);
+ CallingConv::ID CC = CS.getCallingConv();
+
+ // Let SDISel handle vararg functions.
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ if (FTy->isVarArg())
+ return false;
+
+ // Handle *simple* calls for now.
+ MVT RetVT;
+ Type *RetTy = I->getType();
+ if (RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ // Set up the argument vectors.
+ SmallVector<Value *, 8> Args;
+ SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ Args.reserve(CS.arg_size());
+ ArgRegs.reserve(CS.arg_size());
+ ArgVTs.reserve(CS.arg_size());
+ ArgFlags.reserve(CS.arg_size());
+
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ // If we're lowering a memory intrinsic instead of a regular call, skip the
+ // last two arguments, which shouldn't be passed to the underlying function.
+ if (IntrMemName && e - i <= 2)
+ break;
+
+ unsigned Arg = getRegForValue(*i);
+ if (Arg == 0)
+ return false;
+
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1;
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+ CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+ CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ return false;
+
+ MVT ArgVT;
+ Type *ArgTy = (*i)->getType();
+ if (!isTypeLegal(ArgTy, ArgVT) &&
+ !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+ return false;
+
+ // We don't handle vector parameters yet.
+ if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+ return false;
+
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(*i);
+ ArgRegs.push_back(Arg);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Handle the arguments now that we've gotten them.
+ SmallVector<unsigned, 4> RegArgs;
+ unsigned NumBytes;
+ if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+ return false;
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
+ if (!IntrMemName)
+ MIB.addGlobalAddress(GV, 0, 0);
+ else
+ MIB.addExternalSymbol(IntrMemName, 0);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i], RegState::Implicit);
+
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+ // Finish off the call including any return values.
+ SmallVector<unsigned, 4> UsedRegs;
+ if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
+ return false;
+
+ // Set all unused physreg defs as dead.
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+ return true;
+}
+
+bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+ if (Alignment)
+ return Len / Alignment <= 4;
+ else
+ return Len < 32;
+}
+
+bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+ uint64_t Len, unsigned Alignment) {
+ // Make sure we don't bloat code by inlining very large memcpy's.
+ if (!IsMemCpySmall(Len, Alignment))
+ return false;
+
+ int64_t UnscaledOffset = 0;
+ Address OrigDest = Dest;
+ Address OrigSrc = Src;
+
+ while (Len) {
+ MVT VT;
+ if (!Alignment || Alignment >= 8) {
+ if (Len >= 8)
+ VT = MVT::i64;
+ else if (Len >= 4)
+ VT = MVT::i32;
+ else if (Len >= 2)
+ VT = MVT::i16;
+ else {
+ VT = MVT::i8;
+ }
+ } else {
+ // Bound based on alignment.
+ if (Len >= 4 && Alignment == 4)
+ VT = MVT::i32;
+ else if (Len >= 2 && Alignment == 2)
+ VT = MVT::i16;
+ else {
+ VT = MVT::i8;
+ }
+ }
+
+ bool RV;
+ unsigned ResultReg;
+ RV = EmitLoad(VT, ResultReg, Src);
+ assert(RV == true && "Should be able to handle this load.");
+ RV = EmitStore(VT, ResultReg, Dest);
+ assert(RV == true && "Should be able to handle this store.");
+ (void)RV;
+
+ int64_t Size = VT.getSizeInBits() / 8;
+ Len -= Size;
+ UnscaledOffset += Size;
+
+ // We need to recompute the unscaled offset for each iteration.
+ Dest.setOffset(OrigDest.getOffset() + UnscaledOffset);
+ Src.setOffset(OrigSrc.getOffset() + UnscaledOffset);
+ }
+
+ return true;
+}
+
+bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove: {
+ const MemTransferInst &MTI = cast<MemTransferInst>(I);
+ // Don't handle volatile.
+ if (MTI.isVolatile())
+ return false;
+
+ // Disable inlining for memmove before calls to ComputeAddress. Otherwise,
+ // we would emit dead code because we don't currently handle memmoves.
+ bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+ if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+ // Small memcpy's are common enough that we want to do them without a call
+ // if possible.
+ uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+ unsigned Alignment = MTI.getAlignment();
+ if (IsMemCpySmall(Len, Alignment)) {
+ Address Dest, Src;
+ if (!ComputeAddress(MTI.getRawDest(), Dest) ||
+ !ComputeAddress(MTI.getRawSource(), Src))
+ return false;
+ if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+ return true;
+ }
+ }
+
+ if (!MTI.getLength()->getType()->isIntegerTy(64))
+ return false;
+
+ if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+ return SelectCall(&I, IntrMemName);
+ }
+ case Intrinsic::memset: {
+ const MemSetInst &MSI = cast<MemSetInst>(I);
+ // Don't handle volatile.
+ if (MSI.isVolatile())
+ return false;
+
+ if (!MSI.getLength()->getType()->isIntegerTy(64))
+ return false;
+
+ if (MSI.getDestAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ return SelectCall(&I, "memset");
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+ .addImm(1);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AArch64FastISel::SelectRet(const Instruction *I) {
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ if (F.isVarArg())
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ if (Ret->getNumOperands() > 0) {
+ CallingConv::ID CC = F.getCallingConv();
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+ I->getContext());
+ CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
+ CCInfo.AnalyzeReturn(Outs, RetCC);
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+ const Value *RV = Ret->getOperand(0);
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+ unsigned Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo();
+ unsigned DestReg = VA.getLocReg();
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+ return false;
+
+ EVT RVEVT = TLI.getValueType(RV->getType());
+ if (!RVEVT.isSimple())
+ return false;
+
+ // Vectors (of > 1 lane) in big endian need tricky handling.
+ if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+ return false;
+
+ MVT RVVT = RVEVT.getSimpleVT();
+ if (RVVT == MVT::f128)
+ return false;
+ MVT DestVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (RVVT != DestVT) {
+ if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+ return false;
+
+ if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+ return false;
+
+ bool isZExt = Outs[0].Flags.isZExt();
+ SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt);
+ if (SrcReg == 0)
+ return false;
+ }
+
+ // Make the copy.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::RET_ReallyLR));
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+ return true;
+}
+
+bool AArch64FastISel::SelectTrunc(const Instruction *I) {
+ Type *DestTy = I->getType();
+ Value *Op = I->getOperand(0);
+ Type *SrcTy = Op->getType();
+
+ EVT SrcEVT = TLI.getValueType(SrcTy, true);
+ EVT DestEVT = TLI.getValueType(DestTy, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DestVT = DestEVT.getSimpleVT();
+
+ if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+ SrcVT != MVT::i8)
+ return false;
+ if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 &&
+ DestVT != MVT::i1)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Op);
+ if (!SrcReg)
+ return false;
+
+ // If we're truncating from i64 to a smaller non-legal type then generate an
+ // AND. Otherwise, we know the high bits are undefined and a truncate doesn't
+ // generate any code.
+ if (SrcVT == MVT::i64) {
+ uint64_t Mask = 0;
+ switch (DestVT.SimpleTy) {
+ default:
+ // Trunc i64 to i32 is handled by the target-independent fast-isel.
+ return false;
+ case MVT::i1:
+ Mask = 0x1;
+ break;
+ case MVT::i8:
+ Mask = 0xff;
+ break;
+ case MVT::i16:
+ Mask = 0xffff;
+ break;
+ }
+ // Issue an extract_subreg to get the lower 32-bits.
+ unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
+ AArch64::sub_32);
+ MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass);
+ // Create the AND instruction which performs the actual truncation.
+ unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+ ANDReg)
+ .addReg(Reg32)
+ .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32));
+ SrcReg = ANDReg;
+ }
+
+ UpdateValueMap(I, SrcReg);
+ return true;
+}
+
+unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+ assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
+ DestVT == MVT::i64) &&
+ "Unexpected value type.");
+ // Handle i8 and i16 as i32.
+ if (DestVT == MVT::i8 || DestVT == MVT::i16)
+ DestVT = MVT::i32;
+
+ if (isZExt) {
+ MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+ unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+ ResultReg)
+ .addReg(SrcReg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+ if (DestVT == MVT::i64) {
+ // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the
+ // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
+ unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(ResultReg)
+ .addImm(AArch64::sub_32);
+ ResultReg = Reg64;
+ }
+ return ResultReg;
+ } else {
+ if (DestVT == MVT::i64) {
+ // FIXME: We're SExt i1 to i64.
+ return 0;
+ }
+ unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri),
+ ResultReg)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ return ResultReg;
+ }
+}
+
+unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ bool isZExt) {
+ assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
+ unsigned Opc;
+ unsigned Imm = 0;
+
+ switch (SrcVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ return Emiti1Ext(SrcReg, DestVT, isZExt);
+ case MVT::i8:
+ if (DestVT == MVT::i64)
+ Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ else
+ Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Imm = 7;
+ break;
+ case MVT::i16:
+ if (DestVT == MVT::i64)
+ Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ else
+ Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Imm = 15;
+ break;
+ case MVT::i32:
+ assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
+ Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Imm = 31;
+ break;
+ }
+
+ // Handle i8 and i16 as i32.
+ if (DestVT == MVT::i8 || DestVT == MVT::i16)
+ DestVT = MVT::i32;
+ else if (DestVT == MVT::i64) {
+ unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Src64)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(AArch64::sub_32);
+ SrcReg = Src64;
+ }
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(Imm);
+
+ return ResultReg;
+}
+
+bool AArch64FastISel::SelectIntExt(const Instruction *I) {
+ // On ARM, in general, integer casts don't involve legal types; this code
+ // handles promotable integers. The high bits for a type smaller than
+ // the register size are assumed to be undefined.
+ Type *DestTy = I->getType();
+ Value *Src = I->getOperand(0);
+ Type *SrcTy = Src->getType();
+
+ bool isZExt = isa<ZExtInst>(I);
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg)
+ return false;
+
+ EVT SrcEVT = TLI.getValueType(SrcTy, true);
+ EVT DestEVT = TLI.getValueType(DestTy, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DestVT = DestEVT.getSimpleVT();
+ unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+ if (ResultReg == 0)
+ return false;
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+ EVT DestEVT = TLI.getValueType(I->getType(), true);
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT DestVT = DestEVT.getSimpleVT();
+ if (DestVT != MVT::i64 && DestVT != MVT::i32)
+ return false;
+
+ unsigned DivOpc;
+ bool is64bit = (DestVT == MVT::i64);
+ switch (ISDOpcode) {
+ default:
+ return false;
+ case ISD::SREM:
+ DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+ break;
+ case ISD::UREM:
+ DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+ break;
+ }
+ unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+
+ unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
+ .addReg(Src0Reg)
+ .addReg(Src1Reg);
+ // The remainder is computed as numerator - (quotient * denominator) using the
+ // MSUB instruction.
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
+ .addReg(QuotReg)
+ .addReg(Src1Reg)
+ .addReg(Src0Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::SelectMul(const Instruction *I) {
+ EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+ if (!SrcEVT.isSimple())
+ return false;
+ MVT SrcVT = SrcEVT.getSimpleVT();
+
+ // Must be simple value type. Don't handle vectors.
+ if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+ SrcVT != MVT::i8)
+ return false;
+
+ unsigned Opc;
+ unsigned ZReg;
+ switch (SrcVT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ ZReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ break;
+ case MVT::i64:
+ ZReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ break;
+ }
+
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+
+ // Create the base instruction, then add the operands.
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Src0Reg)
+ .addReg(Src1Reg)
+ .addReg(ZReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Load:
+ return SelectLoad(I);
+ case Instruction::Store:
+ return SelectStore(I);
+ case Instruction::Br:
+ return SelectBranch(I);
+ case Instruction::IndirectBr:
+ return SelectIndirectBr(I);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ return SelectCmp(I);
+ case Instruction::Select:
+ return SelectSelect(I);
+ case Instruction::FPExt:
+ return SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return SelectFPTrunc(I);
+ case Instruction::FPToSI:
+ return SelectFPToInt(I, /*Signed=*/true);
+ case Instruction::FPToUI:
+ return SelectFPToInt(I, /*Signed=*/false);
+ case Instruction::SIToFP:
+ return SelectIntToFP(I, /*Signed=*/true);
+ case Instruction::UIToFP:
+ return SelectIntToFP(I, /*Signed=*/false);
+ case Instruction::SRem:
+ return SelectRem(I, ISD::SREM);
+ case Instruction::URem:
+ return SelectRem(I, ISD::UREM);
+ case Instruction::Call:
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ return SelectIntrinsicCall(*II);
+ return SelectCall(I);
+ case Instruction::Ret:
+ return SelectRet(I);
+ case Instruction::Trunc:
+ return SelectTrunc(I);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return SelectIntExt(I);
+ case Instruction::Mul:
+ // FIXME: This really should be handled by the target-independent selector.
+ return SelectMul(I);
+ }
+ return false;
+ // Silence warnings.
+ (void)&CC_AArch64_DarwinPCS_VarArg;
+}
+
+namespace llvm {
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new AArch64FastISel(funcInfo, libInfo);
+}
+}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index b29587a..deb306a 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1,4 +1,4 @@
-//===- AArch64FrameLowering.cpp - AArch64 Frame Information ---------------===//
+//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
// The LLVM Compiler Infrastructure
//
@@ -11,227 +11,444 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64.h"
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-void AArch64FrameLowering::splitSPAdjustments(uint64_t Total,
- uint64_t &Initial,
- uint64_t &Residual) const {
- // 0x1f0 here is a pessimistic (i.e. realistic) boundary: x-register LDP
- // instructions have a 7-bit signed immediate scaled by 8, giving a reach of
- // 0x1f8, but stack adjustment should always be a multiple of 16.
- if (Total <= 0x1f0) {
- Initial = Total;
- Residual = 0;
- } else {
- Initial = 0x1f0;
- Residual = Total - Initial;
+#define DEBUG_TYPE "frame-info"
+
+static cl::opt<bool> EnableRedZone("aarch64-redzone",
+ cl::desc("enable use of redzone on AArch64"),
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+static unsigned estimateStackSize(MachineFunction &MF) {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ int Offset = 0;
+ for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -FFI->getObjectOffset(i);
+ if (FixedOff > Offset)
+ Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ Offset += FFI->getObjectSize(i);
+ unsigned Align = FFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = (Offset + Align - 1) / Align * Align;
+ }
+ // This does not include the 16 bytes used for fp and lr.
+ return (unsigned)Offset;
+}
+
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+ if (!EnableRedZone)
+ return false;
+ // Don't use the red zone if the function explicitly asks us not to.
+ // This is typically used for kernel code.
+ if (MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::NoRedZone))
+ return false;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ unsigned NumBytes = AFI->getLocalStackSize();
+
+ // Note: currently hasFP() is always true for hasCalls(), but that's an
+ // implementation detail of the current code, not a strict requirement,
+ // so stay safe here and check both.
+ if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
+ return false;
+ return true;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+#ifndef NDEBUG
+ const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+ assert(!RegInfo->needsStackRealignment(MF) &&
+ "No stack realignment on AArch64!");
+#endif
+
+ return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function. This eliminates the need for
+/// add/sub sp brackets around call sites. Returns true if the call frame is
+/// included as part of the stack frame.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const AArch64InstrInfo *TII =
+ static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ DebugLoc DL = I->getDebugLoc();
+ int Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ if (!TFI->hasReservedCallFrame(MF)) {
+ unsigned Align = getStackAlignment();
+
+ int64_t Amount = I->getOperand(0).getImm();
+ Amount = RoundUpToAlignment(Amount, Align);
+ if (!IsDestroy)
+ Amount = -Amount;
+
+ // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+ // doesn't have to pop anything), then the first operand will be zero too so
+ // this adjustment is a no-op.
+ if (CalleePopAmount == 0) {
+ // FIXME: in-function stack adjustment for calls is limited to 24-bits
+ // because there's no guaranteed temporary register available.
+ //
+ // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable.
+ // 1) For offset <= 12-bit, we use LSL #0
+ // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+ // LSL #0, and the other uses LSL #12.
+ //
+ // Mostly call frames will be allocated at the start of a function so
+ // this is OK, but it is a limitation that needs dealing with.
+ assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+ }
+ } else if (CalleePopAmount != 0) {
+ // If the calling convention demands that the callee pops arguments from the
+ // stack, we want to add it back if we have a reserved call frame.
+ assert(CalleePopAmount < 0xffffff && "call frame too large");
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
+ TII);
+ }
+ MBB.erase(I);
+}
+
+void AArch64FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned FramePtr) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const AArch64InstrInfo *TII = TM.getInstrInfo();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
+
+ const DataLayout *TD = MF.getTarget().getDataLayout();
+ bool HasFP = hasFP(MF);
+
+ // Calculate amount of bytes used for return address storing.
+ int stackGrowth = -TD->getPointerSize(0);
+
+ // Calculate offsets.
+ int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
+ unsigned TotalSkipped = 0;
+ for (const auto &Info : CSI) {
+ unsigned Reg = Info.getReg();
+ int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
+ getOffsetOfLocalArea() + saveAreaOffset;
+
+ // Don't output a new CFI directive if we're re-saving the frame pointer or
+ // link register. This happens when the PrologEpilogInserter has inserted an
+ // extra "STP" of the frame pointer and link register -- the "emitPrologue"
+ // method automatically generates the directives when frame pointers are
+ // used. If we generate CFI directives for the extra "STP"s, the linker will
+ // lose track of the correct values for the frame pointer and link register.
+ if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) {
+ TotalSkipped += stackGrowth;
+ continue;
+ }
+
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, DwarfReg, Offset - TotalSkipped));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
- AArch64MachineFunctionInfo *FuncInfo =
- MF.getInfo<AArch64MachineFunctionInfo>();
- MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
- DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
-
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const AArch64InstrInfo *TII = TM.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
- const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- bool NeedsFrameMoves = MMI.hasDebugInfo()
- || MF.getFunction()->needsUnwindTableEntry();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+ bool HasFP = hasFP(MF);
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
- uint64_t NumInitialBytes, NumResidualBytes;
+ int NumBytes = (int)MFI->getStackSize();
+ if (!AFI->hasStackFrame()) {
+ assert(!HasFP && "unexpected function without stack frame but with FP");
- // Currently we expect the stack to be laid out by
- // sub sp, sp, #initial
- // stp x29, x30, [sp, #offset]
- // ...
- // str xxx, [sp, #offset]
- // sub sp, sp, #rest (possibly via extra instructions).
- if (MFI->getCalleeSavedInfo().size()) {
- // If there are callee-saved registers, we want to store them efficiently as
- // a block, and virtual base assignment happens too early to do it for us so
- // we adjust the stack in two phases: first just for callee-saved fiddling,
- // then to allocate the rest of the frame.
- splitSPAdjustments(MFI->getStackSize(), NumInitialBytes, NumResidualBytes);
- } else {
- // If there aren't any callee-saved registers, two-phase adjustment is
- // inefficient. It's more efficient to adjust with NumInitialBytes too
- // because when we're in a "callee pops argument space" situation, that pop
- // must be tacked onto Initial for correctness.
- NumInitialBytes = MFI->getStackSize();
- NumResidualBytes = 0;
- }
+ // All of the stack allocation is for locals.
+ AFI->setLocalStackSize(NumBytes);
- // Tell everyone else how much adjustment we're expecting them to use. In
- // particular if an adjustment is required for a tail call the epilogue could
- // have a different view of things.
- FuncInfo->setInitialStackAdjust(NumInitialBytes);
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
- emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumInitialBytes,
- MachineInstr::FrameSetup);
+ // REDZONE: If the stack size is less than 128 bytes, we don't need
+ // to actually allocate.
+ if (NumBytes && !canUseRedZone(MF)) {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
- if (NeedsFrameMoves && NumInitialBytes) {
- // We emit this update even if the CFA is set from a frame pointer later so
- // that the CFA is valid in the interim.
- MachineLocation Dst(MachineLocation::VirtualFP);
- unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfa(nullptr, Reg, -NumInitialBytes));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-
- // Otherwise we need to set the frame pointer and/or add a second stack
- // adjustment.
-
- bool FPNeedsSetting = hasFP(MF);
- for (; MBBI != MBB.end(); ++MBBI) {
- // Note that this search makes strong assumptions about the operation used
- // to store the frame-pointer: it must be "STP x29, x30, ...". This could
- // change in future, but until then there's no point in implementing
- // untestable more generic cases.
- if (FPNeedsSetting && MBBI->getOpcode() == AArch64::LSPair64_STR
- && MBBI->getOperand(0).getReg() == AArch64::X29) {
- int64_t X29FrameIdx = MBBI->getOperand(2).getIndex();
- FuncInfo->setFramePointerOffset(MFI->getObjectOffset(X29FrameIdx));
-
- ++MBBI;
- emitRegUpdate(MBB, MBBI, DL, TII, AArch64::X29, AArch64::XSP,
- AArch64::X29,
- NumInitialBytes + MFI->getObjectOffset(X29FrameIdx),
- MachineInstr::FrameSetup);
-
- // The offset adjustment used when emitting debugging locations relative
- // to whatever frame base is set. AArch64 uses the default frame base (FP
- // or SP) and this adjusts the calculations to be correct.
- MFI->setOffsetAdjustment(- MFI->getObjectOffset(X29FrameIdx)
- - MFI->getStackSize());
-
- if (NeedsFrameMoves) {
- unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true);
- unsigned Offset = MFI->getObjectOffset(X29FrameIdx);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfa(nullptr, Reg, Offset));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-
- FPNeedsSetting = false;
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else if (NumBytes) {
+ ++NumRedZoneFunctions;
}
- if (!MBBI->getFlag(MachineInstr::FrameSetup))
- break;
- }
-
- assert(!FPNeedsSetting && "Frame pointer couldn't be set");
-
- emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumResidualBytes,
- MachineInstr::FrameSetup);
-
- // Now we emit the rest of the frame setup information, if necessary: we've
- // already noted the FP and initial SP moves so we're left with the prologue's
- // final SP update and callee-saved register locations.
- if (!NeedsFrameMoves)
return;
-
- // The rest of the stack adjustment
- if (!hasFP(MF) && NumResidualBytes) {
- MachineLocation Dst(MachineLocation::VirtualFP);
- unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
- unsigned Offset = NumResidualBytes + NumInitialBytes;
- unsigned CFIIndex =
- MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
}
- // And any callee-saved registers (it's fine to leave them to the end here,
- // because the old values are still valid at this point.
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- if (CSI.size()) {
- for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
- E = CSI.end(); I != E; ++I) {
- unsigned Offset = MFI->getObjectOffset(I->getFrameIdx());
- unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true);
+ // Only set up FP if we actually need to.
+ int FPOffset = 0;
+ if (HasFP) {
+ // First instruction must a) allocate the stack and b) have an immediate
+ // that is a multiple of -2.
+ assert((MBBI->getOpcode() == AArch64::STPXpre ||
+ MBBI->getOpcode() == AArch64::STPDpre) &&
+ MBBI->getOperand(3).getReg() == AArch64::SP &&
+ MBBI->getOperand(4).getImm() < 0 &&
+ (MBBI->getOperand(4).getImm() & 1) == 0);
+
+ // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
+ // required for the callee saved register area we get the frame pointer
+ // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
+ FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8;
+ assert(FPOffset >= 0 && "Bad Framepointer Offset");
+ }
+
+ // Move past the saves of the callee-saved registers.
+ while (MBBI->getOpcode() == AArch64::STPXi ||
+ MBBI->getOpcode() == AArch64::STPDi ||
+ MBBI->getOpcode() == AArch64::STPXpre ||
+ MBBI->getOpcode() == AArch64::STPDpre) {
+ ++MBBI;
+ NumBytes -= 16;
+ }
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+ if (HasFP) {
+ // Issue sub fp, sp, FPOffset or
+ // mov fp,sp when FPOffset is zero.
+ // Note: All stores of callee-saved registers are marked as "FrameSetup".
+ // This code marks the instruction(s) that set the FP also.
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
+ MachineInstr::FrameSetup);
+ }
+
+ // All of the remaining stack allocations are for locals.
+ AFI->setLocalStackSize(NumBytes);
+
+ // Allocate space for the rest of the frame.
+ if (NumBytes) {
+ // If we're a leaf function, try using the red zone.
+ if (!canUseRedZone(MF))
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+ }
+
+ // If we need a base pointer, set it up here. It's whatever the value of the
+ // stack pointer is at this point. Any variable size objects will be allocated
+ // after this, so we can still use the base pointer to reference locals.
+ //
+ // FIXME: Clarify FrameSetup flags here.
+ // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+ // needed.
+ //
+ if (RegInfo->hasBasePointer(MF))
+ TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
+
+ if (needsFrameMoves) {
+ const DataLayout *TD = MF.getTarget().getDataLayout();
+ const int StackGrowth = -TD->getPointerSize(0);
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+ // An example of the prologue:
+ //
+ // .globl __foo
+ // .align 2
+ // __foo:
+ // Ltmp0:
+ // .cfi_startproc
+ // .cfi_personality 155, ___gxx_personality_v0
+ // Leh_func_begin:
+ // .cfi_lsda 16, Lexception33
+ //
+ // stp xa,bx, [sp, -#offset]!
+ // ...
+ // stp x28, x27, [sp, #offset-32]
+ // stp fp, lr, [sp, #offset-16]
+ // add fp, sp, #offset - 16
+ // sub sp, sp, #1360
+ //
+ // The Stack:
+ // +-------------------------------------------+
+ // 10000 | ........ | ........ | ........ | ........ |
+ // 10004 | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ // 10008 | ........ | ........ | ........ | ........ |
+ // 1000c | ........ | ........ | ........ | ........ |
+ // +===========================================+
+ // 10010 | X28 Register |
+ // 10014 | X28 Register |
+ // +-------------------------------------------+
+ // 10018 | X27 Register |
+ // 1001c | X27 Register |
+ // +===========================================+
+ // 10020 | Frame Pointer |
+ // 10024 | Frame Pointer |
+ // +-------------------------------------------+
+ // 10028 | Link Register |
+ // 1002c | Link Register |
+ // +===========================================+
+ // 10030 | ........ | ........ | ........ | ........ |
+ // 10034 | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ // 10038 | ........ | ........ | ........ | ........ |
+ // 1003c | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ //
+ // [sp] = 10030 :: >>initial value<<
+ // sp = 10020 :: stp fp, lr, [sp, #-16]!
+ // fp = sp == 10020 :: mov fp, sp
+ // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
+ // sp == 10010 :: >>final value<<
+ //
+ // The frame pointer (w29) points to address 10020. If we use an offset of
+ // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+ // for w27, and -32 for w28:
+ //
+ // Ltmp1:
+ // .cfi_def_cfa w29, 16
+ // Ltmp2:
+ // .cfi_offset w30, -8
+ // Ltmp3:
+ // .cfi_offset w29, -16
+ // Ltmp4:
+ // .cfi_offset w27, -24
+ // Ltmp5:
+ // .cfi_offset w28, -32
+
+ if (HasFP) {
+ // Define the current CFA rule to use the provided FP.
+ unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, Reg, Offset));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Record the location of the stored LR
+ unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Record the location of the stored FP
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else {
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
+
+ // Now emit the moves for whatever callee saved regs we have.
+ emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
}
}
-void
-AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- AArch64MachineFunctionInfo *FuncInfo =
- MF.getInfo<AArch64MachineFunctionInfo>();
+static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ if (Reg == CSRegs[i])
+ return true;
+ return false;
+}
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+ unsigned RtIdx = 0;
+ if (MI->getOpcode() == AArch64::LDPXpost ||
+ MI->getOpcode() == AArch64::LDPDpost)
+ RtIdx = 1;
+
+ if (MI->getOpcode() == AArch64::LDPXpost ||
+ MI->getOpcode() == AArch64::LDPDpost ||
+ MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) {
+ if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) ||
+ !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) ||
+ MI->getOperand(RtIdx + 2).getReg() != AArch64::SP)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const AArch64InstrInfo *TII =
+ static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned RetOpcode = MBBI->getOpcode();
+ int NumBytes = MFI->getStackSize();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
// Initial and residual are named for consitency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
- uint64_t NumInitialBytes = FuncInfo->getInitialStackAdjust();
- uint64_t NumResidualBytes = MFI.getStackSize() - NumInitialBytes;
uint64_t ArgumentPopSize = 0;
- if (RetOpcode == AArch64::TC_RETURNdi ||
- RetOpcode == AArch64::TC_RETURNxi) {
- MachineOperand &JumpTarget = MBBI->getOperand(0);
+ if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) {
MachineOperand &StackAdjust = MBBI->getOperand(1);
- MachineInstrBuilder MIB;
- if (RetOpcode == AArch64::TC_RETURNdi) {
- MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_Bimm));
- if (JumpTarget.isGlobal()) {
- MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
- JumpTarget.getTargetFlags());
- } else {
- assert(JumpTarget.isSymbol() && "unexpected tail call destination");
- MIB.addExternalSymbol(JumpTarget.getSymbolName(),
- JumpTarget.getTargetFlags());
- }
- } else {
- assert(RetOpcode == AArch64::TC_RETURNxi && JumpTarget.isReg()
- && "Unexpected tail call");
-
- MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_BRx));
- MIB.addReg(JumpTarget.getReg(), RegState::Kill);
- }
-
- // Add the extra operands onto the new tail call instruction even though
- // they're not used directly (so that liveness is tracked properly etc).
- for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
- MIB->addOperand(MBBI->getOperand(i));
-
-
- // Delete the pseudo instruction TC_RETURN.
- MachineInstr *NewMI = std::prev(MBBI);
- MBB.erase(MBBI);
- MBBI = NewMI;
-
// For a tail-call in a callee-pops-arguments environment, some or all of
// the stack may actually be in use for the call's arguments, this is
// calculated during LowerCall and consumed here...
@@ -241,386 +458,434 @@
// conveniently stored in the MachineFunctionInfo by
// LowerFormalArguments. This will, of course, be zero for the C calling
// convention.
- ArgumentPopSize = FuncInfo->getArgumentStackToRestore();
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
}
- assert(NumInitialBytes % 16 == 0 && NumResidualBytes % 16 == 0
- && "refusing to adjust stack by misaligned amt");
-
- // We may need to address callee-saved registers differently, so find out the
- // bound on the frame indices.
- const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- int MinCSFI = 0;
- int MaxCSFI = -1;
-
- if (CSI.size()) {
- MinCSFI = CSI[0].getFrameIdx();
- MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
- }
-
- // The "residual" stack update comes first from this direction and guarantees
- // that SP is NumInitialBytes below its value on function entry, either by a
- // direct update or restoring it from the frame pointer.
- if (NumInitialBytes + ArgumentPopSize != 0) {
- emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16,
- NumInitialBytes + ArgumentPopSize);
- --MBBI;
- }
-
-
- // MBBI now points to the instruction just past the last callee-saved
- // restoration (either RET/B if NumInitialBytes == 0, or the "ADD sp, sp"
- // otherwise).
-
- // Now we need to find out where to put the bulk of the stack adjustment
- MachineBasicBlock::iterator FirstEpilogue = MBBI;
- while (MBBI != MBB.begin()) {
- --MBBI;
-
- unsigned FrameOp;
- for (FrameOp = 0; FrameOp < MBBI->getNumOperands(); ++FrameOp) {
- if (MBBI->getOperand(FrameOp).isFI())
- break;
- }
-
- // If this instruction doesn't have a frame index we've reached the end of
- // the callee-save restoration.
- if (FrameOp == MBBI->getNumOperands())
- break;
-
- // Likewise if it *is* a local reference, but not to a callee-saved object.
- int FrameIdx = MBBI->getOperand(FrameOp).getIndex();
- if (FrameIdx < MinCSFI || FrameIdx > MaxCSFI)
- break;
-
- FirstEpilogue = MBBI;
- }
-
- if (MF.getFrameInfo()->hasVarSizedObjects()) {
- int64_t StaticFrameBase;
- StaticFrameBase = -(NumInitialBytes + FuncInfo->getFramePointerOffset());
- emitRegUpdate(MBB, FirstEpilogue, DL, TII,
- AArch64::XSP, AArch64::X29, AArch64::NoRegister,
- StaticFrameBase);
- } else {
- emitSPUpdate(MBB, FirstEpilogue, DL,TII, AArch64::X16, NumResidualBytes);
- }
-}
-
-int64_t
-AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF,
- int FrameIndex,
- unsigned &FrameReg,
- int SPAdj,
- bool IsCalleeSaveOp) const {
- AArch64MachineFunctionInfo *FuncInfo =
- MF.getInfo<AArch64MachineFunctionInfo>();
- MachineFrameInfo *MFI = MF.getFrameInfo();
-
- int64_t TopOfFrameOffset = MFI->getObjectOffset(FrameIndex);
-
- assert(!(IsCalleeSaveOp && FuncInfo->getInitialStackAdjust() == 0)
- && "callee-saved register in unexpected place");
-
- // If the frame for this function is particularly large, we adjust the stack
- // in two phases which means the callee-save related operations see a
- // different (intermediate) stack size.
- int64_t FrameRegPos;
- if (IsCalleeSaveOp) {
- FrameReg = AArch64::XSP;
- FrameRegPos = -static_cast<int64_t>(FuncInfo->getInitialStackAdjust());
- } else if (useFPForAddressing(MF)) {
- // Have to use the frame pointer since we have no idea where SP is.
- FrameReg = AArch64::X29;
- FrameRegPos = FuncInfo->getFramePointerOffset();
- } else {
- FrameReg = AArch64::XSP;
- FrameRegPos = -static_cast<int64_t>(MFI->getStackSize()) + SPAdj;
- }
-
- return TopOfFrameOffset - FrameRegPos;
-}
-
-void
-AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
- const AArch64RegisterInfo *RegInfo =
- static_cast<const AArch64RegisterInfo *>(MF.getTarget().getRegisterInfo());
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const AArch64InstrInfo &TII =
- *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
-
- if (hasFP(MF)) {
- MF.getRegInfo().setPhysRegUsed(AArch64::X29);
- MF.getRegInfo().setPhysRegUsed(AArch64::X30);
- }
-
- // If addressing of local variables is going to be more complicated than
- // shoving a base register and an offset into the instruction then we may well
- // need to scavenge registers. We should either specifically add an
- // callee-save register for this purpose or allocate an extra spill slot.
- bool BigStack =
- MFI->estimateStackSize(MF) >= TII.estimateRSStackLimit(MF)
- || MFI->hasVarSizedObjects() // Access will be from X29: messes things up
- || (MFI->adjustsStack() && !hasReservedCallFrame(MF));
-
- if (!BigStack)
- return;
-
- // We certainly need some slack space for the scavenger, preferably an extra
- // register.
- const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
- uint16_t ExtraReg = AArch64::NoRegister;
-
- for (unsigned i = 0; CSRegs[i]; ++i) {
- if (AArch64::GPR64RegClass.contains(CSRegs[i]) &&
- !MF.getRegInfo().isPhysRegUsed(CSRegs[i])) {
- ExtraReg = CSRegs[i];
- break;
- }
- }
-
- if (ExtraReg != 0) {
- MF.getRegInfo().setPhysRegUsed(ExtraReg);
- } else {
- assert(RS && "Expect register scavenger to be available");
-
- // Create a stack slot for scavenging purposes. PrologEpilogInserter
- // helpfully places it near either SP or FP for us to avoid
- // infinitely-regression during scavenging.
- const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
- RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
- }
-}
-
-bool AArch64FrameLowering::determinePrologueDeath(MachineBasicBlock &MBB,
- unsigned Reg) const {
- // If @llvm.returnaddress is called then it will refer to X30 by some means;
- // the prologue store does not kill the register.
- if (Reg == AArch64::X30) {
- if (MBB.getParent()->getFrameInfo()->isReturnAddressTaken()
- && MBB.getParent()->getRegInfo().isLiveIn(Reg))
- return false;
- }
-
- // In all other cases, physical registers are dead after they've been saved
- // but live at the beginning of the prologue block.
- MBB.addLiveIn(Reg);
- return true;
-}
-
-void
-AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI,
- const LoadStoreMethod PossClasses[],
- unsigned NumClasses) const {
- DebugLoc DL = MBB.findDebugLoc(MBBI);
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-
- // A certain amount of implicit contract is present here. The actual stack
- // offsets haven't been allocated officially yet, so for strictly correct code
- // we rely on the fact that the elements of CSI are allocated in order
- // starting at SP, purely as dictated by size and alignment. In practice since
- // this function handles the only accesses to those slots it's not quite so
- // important.
+ // The stack frame should be like below,
//
- // We have also ordered the Callee-saved register list in AArch64CallingConv
- // so that the above scheme puts registers in order: in particular we want
- // &X30 to be &X29+8 for an ABI-correct frame record (PCS 5.2.2)
- for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
- unsigned Reg = CSI[i].getReg();
+ // ---------------------- ---
+ // | | |
+ // | BytesInStackArgArea| CalleeArgStackSize
+ // | (NumReusableBytes) | (of tail call)
+ // | | ---
+ // | | |
+ // ---------------------| --- |
+ // | | | |
+ // | CalleeSavedReg | | |
+ // | (NumRestores * 16) | | |
+ // | | | |
+ // ---------------------| | NumBytes
+ // | | StackSize (StackAdjustUp)
+ // | LocalStackSize | | |
+ // | (covering callee | | |
+ // | args) | | |
+ // | | | |
+ // ---------------------- --- ---
+ //
+ // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+ // = StackSize + ArgumentPopSize
+ //
+ // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+ // it as the 2nd argument of AArch64ISD::TC_RETURN.
+ NumBytes += ArgumentPopSize;
- // First we need to find out which register class the register belongs to so
- // that we can use the correct load/store instrucitons.
- unsigned ClassIdx;
- for (ClassIdx = 0; ClassIdx < NumClasses; ++ClassIdx) {
- if (PossClasses[ClassIdx].RegClass->contains(Reg))
- break;
+ unsigned NumRestores = 0;
+ // Move past the restores of the callee-saved registers.
+ MachineBasicBlock::iterator LastPopI = MBBI;
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ if (LastPopI != MBB.begin()) {
+ do {
+ ++NumRestores;
+ --LastPopI;
+ } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
+ if (!isCSRestore(LastPopI, CSRegs)) {
+ ++LastPopI;
+ --NumRestores;
}
- assert(ClassIdx != NumClasses
- && "Asked to store register in unexpected class");
- const TargetRegisterClass &TheClass = *PossClasses[ClassIdx].RegClass;
-
- // Now we need to decide whether it's possible to emit a paired instruction:
- // for this we want the next register to be in the same class.
- MachineInstrBuilder NewMI;
- bool Pair = false;
- if (i + 1 < CSI.size() && TheClass.contains(CSI[i+1].getReg())) {
- Pair = true;
- unsigned StLow = 0, StHigh = 0;
- if (isPrologue) {
- // Most of these registers will be live-in to the MBB and killed by our
- // store, though there are exceptions (see determinePrologueDeath).
- StLow = getKillRegState(determinePrologueDeath(MBB, CSI[i+1].getReg()));
- StHigh = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
- } else {
- StLow = RegState::Define;
- StHigh = RegState::Define;
- }
-
- NewMI = BuildMI(MBB, MBBI, DL, TII.get(PossClasses[ClassIdx].PairOpcode))
- .addReg(CSI[i+1].getReg(), StLow)
- .addReg(CSI[i].getReg(), StHigh);
-
- // If it's a paired op, we've consumed two registers
- ++i;
- } else {
- unsigned State;
- if (isPrologue) {
- State = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
- } else {
- State = RegState::Define;
- }
-
- NewMI = BuildMI(MBB, MBBI, DL,
- TII.get(PossClasses[ClassIdx].SingleOpcode))
- .addReg(CSI[i].getReg(), State);
- }
-
- // Note that the FrameIdx refers to the second register in a pair: it will
- // be allocated the smaller numeric address and so is the one an LDP/STP
- // address must use.
- int FrameIdx = CSI[i].getFrameIdx();
- MachineMemOperand::MemOperandFlags Flags;
- Flags = isPrologue ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad;
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- Flags,
- Pair ? TheClass.getSize() * 2 : TheClass.getSize(),
- MFI.getObjectAlignment(FrameIdx));
-
- NewMI.addFrameIndex(FrameIdx)
- .addImm(0) // address-register offset
- .addMemOperand(MMO);
-
- if (isPrologue)
- NewMI.setMIFlags(MachineInstr::FrameSetup);
-
- // For aesthetic reasons, during an epilogue we want to emit complementary
- // operations to the prologue, but in the opposite order. So we still
- // iterate through the CalleeSavedInfo list in order, but we put the
- // instructions successively earlier in the MBB.
- if (!isPrologue)
- --MBBI;
}
+ NumBytes -= NumRestores * 16;
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ if (!hasFP(MF)) {
+ // If this was a redzone leaf function, we don't need to restore the
+ // stack pointer.
+ if (!canUseRedZone(MF))
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
+ TII);
+ return;
+ }
+
+ // Restore the original stack pointer.
+ // FIXME: Rather than doing the math here, we should instead just use
+ // non-post-indexed loads for the restores if we aren't actually going to
+ // be able to save any instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+ -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
}
-bool
-AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- if (CSI.empty())
- return false;
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+ int FI) const {
+ unsigned FrameReg;
+ return getFrameIndexReference(MF, FI, FrameReg);
+}
- static const LoadStoreMethod PossibleClasses[] = {
- {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR},
- {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR},
- };
- const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info. It's the same as what we use for resolving the code-gen
+/// references for now. FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ return resolveFrameIndexReference(MF, FI, FrameReg);
+}
- emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI,
- PossibleClasses, NumClasses);
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ bool PreferFP) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ int FPOffset = MFI->getObjectOffset(FI) + 16;
+ int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+ bool isFixed = MFI->isFixedObjectIndex(FI);
+ // Use frame pointer to reference fixed objects. Use it for locals if
+ // there are VLAs (and thus the SP isn't reliable as a base).
+ // Make sure useFPForScavengingIndex() does the right thing for the emergency
+ // spill slot.
+ bool UseFP = false;
+ if (AFI->hasStackFrame()) {
+ // Note: Keeping the following as multiple 'if' statements rather than
+ // merging to a single expression for readability.
+ //
+ // Argument access should always use the FP.
+ if (isFixed) {
+ UseFP = hasFP(MF);
+ } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
+ // Use SP or FP, whichever gives us the best chance of the offset
+ // being in range for direct access. If the FPOffset is positive,
+ // that'll always be best, as the SP will be even further away.
+ // If the FPOffset is negative, we have to keep in mind that the
+ // available offset range for negative offsets is smaller than for
+ // positive ones. If we have variable sized objects, we're stuck with
+ // using the FP regardless, though, as the SP offset is unknown
+ // and we don't have a base pointer available. If an offset is
+ // available via the FP and the SP, use whichever is closest.
+ if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
+ (FPOffset >= -256 && Offset > -FPOffset))
+ UseFP = true;
+ }
+ }
+
+ if (UseFP) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+
+ // Use the base pointer if we have one.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ else {
+ FrameReg = AArch64::SP;
+ // If we're using the red zone for this function, the SP won't actually
+ // be adjusted, so the offsets will be negative. They're also all
+ // within range of the signed 9-bit immediate instructions.
+ if (canUseRedZone(MF))
+ Offset -= AFI->getLocalStackSize();
+ }
+
+ return Offset;
+}
+
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+ if (Reg != AArch64::LR)
+ return getKillRegState(true);
+
+ // LR maybe referred to later by an @llvm.returnaddress intrinsic.
+ bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
+ bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
+ return getKillRegState(LRKill);
+}
+
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned Count = CSI.size();
+ DebugLoc DL;
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+ if (MI != MBB.end())
+ DL = MI->getDebugLoc();
+
+ for (unsigned i = 0; i < Count; i += 2) {
+ unsigned idx = Count - i - 2;
+ unsigned Reg1 = CSI[idx].getReg();
+ unsigned Reg2 = CSI[idx + 1].getReg();
+ // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+ // list to come in sorted by frame index so that we can issue the store
+ // pair instructions directly. Assert if we see anything otherwise.
+ //
+ // The order of the registers in the list is controlled by
+ // getCalleeSavedRegs(), so they will always be in-order, as well.
+ assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+ "Out of order callee saved regs!");
+ unsigned StrOpc;
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+ // Issue sequence of non-sp increment and pi sp spills for cs regs. The
+ // first spill is a pre-increment that allocates the stack.
+ // For example:
+ // stp x22, x21, [sp, #-48]! // addImm(-6)
+ // stp x20, x19, [sp, #16] // addImm(+2)
+ // stp fp, lr, [sp, #32] // addImm(+4)
+ // Rationale: This sequence saves uop updates compared to a sequence of
+ // pre-increment spills like stp xi,xj,[sp,#-16]!
+ // Note: Similar rational and sequence for restores in epilog.
+ if (AArch64::GPR64RegClass.contains(Reg1)) {
+ assert(AArch64::GPR64RegClass.contains(Reg2) &&
+ "Expected GPR64 callee-saved register pair!");
+ // For first spill use pre-increment store.
+ if (i == 0)
+ StrOpc = AArch64::STPXpre;
+ else
+ StrOpc = AArch64::STPXi;
+ } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+ assert(AArch64::FPR64RegClass.contains(Reg2) &&
+ "Expected FPR64 callee-saved register pair!");
+ // For first spill use pre-increment store.
+ if (i == 0)
+ StrOpc = AArch64::STPDpre;
+ else
+ StrOpc = AArch64::STPDi;
+ } else
+ llvm_unreachable("Unexpected callee saved register!");
+ DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
+ << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
+ << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
+ // Compute offset: i = 0 => offset = -Count;
+ // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
+ const int Offset = (i == 0) ? -Count : i;
+ assert((Offset >= -64 && Offset <= 63) &&
+ "Offset out of bounds for STP immediate");
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
+ MIB.addReg(AArch64::SP, RegState::Define);
+
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
+ .addReg(Reg1, getPrologueDeath(MF, Reg1))
+ .addReg(AArch64::SP)
+ .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
return true;
}
-bool
-AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool AArch64FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned Count = CSI.size();
+ DebugLoc DL;
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- if (CSI.empty())
- return false;
+ if (MI != MBB.end())
+ DL = MI->getDebugLoc();
- static const LoadStoreMethod PossibleClasses[] = {
- {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR},
- {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR},
- };
- const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+ for (unsigned i = 0; i < Count; i += 2) {
+ unsigned Reg1 = CSI[i].getReg();
+ unsigned Reg2 = CSI[i + 1].getReg();
+ // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+ // list to come in sorted by frame index so that we can issue the store
+ // pair instructions directly. Assert if we see anything otherwise.
+ assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
+ "Out of order callee saved regs!");
+ // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
+ // the last load is sp-pi post-increment and de-allocates the stack:
+ // For example:
+ // ldp fp, lr, [sp, #32] // addImm(+4)
+ // ldp x20, x19, [sp, #16] // addImm(+2)
+ // ldp x22, x21, [sp], #48 // addImm(+6)
+ // Note: see comment in spillCalleeSavedRegisters()
+ unsigned LdrOpc;
- emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI,
- PossibleClasses, NumClasses);
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+ if (AArch64::GPR64RegClass.contains(Reg1)) {
+ assert(AArch64::GPR64RegClass.contains(Reg2) &&
+ "Expected GPR64 callee-saved register pair!");
+ if (i == Count - 2)
+ LdrOpc = AArch64::LDPXpost;
+ else
+ LdrOpc = AArch64::LDPXi;
+ } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+ assert(AArch64::FPR64RegClass.contains(Reg2) &&
+ "Expected FPR64 callee-saved register pair!");
+ if (i == Count - 2)
+ LdrOpc = AArch64::LDPDpost;
+ else
+ LdrOpc = AArch64::LDPDi;
+ } else
+ llvm_unreachable("Unexpected callee saved register!");
+ DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
+ << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
+ << ", " << CSI[i + 1].getFrameIdx() << ")\n");
+ // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
+ // etc.
+ const int Offset = (i == Count - 2) ? Count : Count - i - 2;
+ assert((Offset >= -64 && Offset <= 63) &&
+ "Offset out of bounds for LDP immediate");
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+ if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
+ MIB.addReg(AArch64::SP, RegState::Define);
+
+ MIB.addReg(Reg2, getDefRegState(true))
+ .addReg(Reg1, getDefRegState(true))
+ .addReg(AArch64::SP)
+ .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8]
+ // where the factor * 8 is implicit
+ }
return true;
}
-bool
-AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
+void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
+ MachineFunction &MF, RegScavenger *RS) const {
+ const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ SmallVector<unsigned, 4> UnspilledCSGPRs;
+ SmallVector<unsigned, 4> UnspilledCSFPRs;
- // This is a decision of ABI compliance. The AArch64 PCS gives various options
- // for conformance, and even at the most stringent level more or less permits
- // elimination for leaf functions because there's no loss of functionality
- // (for debugging etc)..
- if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->hasCalls())
- return true;
-
- // The following are hard-limits: incorrect code will be generated if we try
- // to omit the frame.
- return (RI->needsStackRealignment(MF) ||
- MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken());
-}
-
-bool
-AArch64FrameLowering::useFPForAddressing(const MachineFunction &MF) const {
- return MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-bool
-AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
-
- // Of the various reasons for having a frame pointer, it's actually only
- // variable-sized objects that prevent reservation of a call frame.
- return !(hasFP(MF) && MFI->hasVarSizedObjects());
-}
-
-void
-AArch64FrameLowering::eliminateCallFramePseudoInstr(
- MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const {
- const AArch64InstrInfo &TII =
- *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
- DebugLoc dl = MI->getDebugLoc();
- int Opcode = MI->getOpcode();
- bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode();
- uint64_t CalleePopAmount = IsDestroy ? MI->getOperand(1).getImm() : 0;
-
- if (!hasReservedCallFrame(MF)) {
- unsigned Align = getStackAlignment();
-
- int64_t Amount = MI->getOperand(0).getImm();
- Amount = RoundUpToAlignment(Amount, Align);
- if (!IsDestroy) Amount = -Amount;
-
- // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
- // doesn't have to pop anything), then the first operand will be zero too so
- // this adjustment is a no-op.
- if (CalleePopAmount == 0) {
- // FIXME: in-function stack adjustment for calls is limited to 12-bits
- // because there's no guaranteed temporary register available. Mostly call
- // frames will be allocated at the start of a function so this is OK, but
- // it is a limitation that needs dealing with.
- assert(Amount > -0xfff && Amount < 0xfff && "call frame too large");
- emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, Amount);
- }
- } else if (CalleePopAmount != 0) {
- // If the calling convention demands that the callee pops arguments from the
- // stack, we want to add it back if we have a reserved call frame.
- assert(CalleePopAmount < 0xfff && "call frame too large");
- emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, -CalleePopAmount);
+ // The frame record needs to be created by saving the appropriate registers
+ if (hasFP(MF)) {
+ MRI->setPhysRegUsed(AArch64::FP);
+ MRI->setPhysRegUsed(AArch64::LR);
}
- MBB.erase(MI);
+ // Spill the BasePtr if it's used. Do this first thing so that the
+ // getCalleeSavedRegs() below will get the right answer.
+ if (RegInfo->hasBasePointer(MF))
+ MRI->setPhysRegUsed(RegInfo->getBaseRegister());
+
+ // If any callee-saved registers are used, the frame cannot be eliminated.
+ unsigned NumGPRSpilled = 0;
+ unsigned NumFPRSpilled = 0;
+ bool ExtraCSSpill = false;
+ bool CanEliminateFrame = true;
+ DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+ // Check pairs of consecutive callee-saved registers.
+ for (unsigned i = 0; CSRegs[i]; i += 2) {
+ assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
+
+ const unsigned OddReg = CSRegs[i];
+ const unsigned EvenReg = CSRegs[i + 1];
+ assert((AArch64::GPR64RegClass.contains(OddReg) &&
+ AArch64::GPR64RegClass.contains(EvenReg)) ^
+ (AArch64::FPR64RegClass.contains(OddReg) &&
+ AArch64::FPR64RegClass.contains(EvenReg)) &&
+ "Register class mismatch!");
+
+ const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
+ const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
+
+ // Early exit if none of the registers in the register pair is actually
+ // used.
+ if (!OddRegUsed && !EvenRegUsed) {
+ if (AArch64::GPR64RegClass.contains(OddReg)) {
+ UnspilledCSGPRs.push_back(OddReg);
+ UnspilledCSGPRs.push_back(EvenReg);
+ } else {
+ UnspilledCSFPRs.push_back(OddReg);
+ UnspilledCSFPRs.push_back(EvenReg);
+ }
+ continue;
+ }
+
+ unsigned Reg = AArch64::NoRegister;
+ // If only one of the registers of the register pair is used, make sure to
+ // mark the other one as used as well.
+ if (OddRegUsed ^ EvenRegUsed) {
+ // Find out which register is the additional spill.
+ Reg = OddRegUsed ? EvenReg : OddReg;
+ MRI->setPhysRegUsed(Reg);
+ }
+
+ DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
+ DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
+
+ assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
+ (RegInfo->getEncodingValue(OddReg) + 1 ==
+ RegInfo->getEncodingValue(EvenReg))) &&
+ "Register pair of non-adjacent registers!");
+ if (AArch64::GPR64RegClass.contains(OddReg)) {
+ NumGPRSpilled += 2;
+ // If it's not a reserved register, we can use it in lieu of an
+ // emergency spill slot for the register scavenger.
+ // FIXME: It would be better to instead keep looking and choose another
+ // unspilled register that isn't reserved, if there is one.
+ if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ } else
+ NumFPRSpilled += 2;
+
+ CanEliminateFrame = false;
+ }
+
+ // FIXME: Set BigStack if any stack slot references may be out of range.
+ // For now, just conservatively guestimate based on unscaled indexing
+ // range. We'll end up allocating an unnecessary spill slot a lot, but
+ // realistically that's not a big deal at this stage of the game.
+ // The CSR spill slots have not been allocated yet, so estimateStackSize
+ // won't include them.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+ DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+ bool BigStack = (CFSize >= 256);
+ if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+ AFI->setHasStackFrame(true);
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+ // register scavenging. If we already spilled an extra callee-saved register
+ // above to keep the number of spills even, we don't need to do anything else
+ // here.
+ if (BigStack && !ExtraCSSpill) {
+
+ // If we're adding a register to spill here, we have to add two of them
+ // to keep the number of regs to spill even.
+ assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
+ unsigned Count = 0;
+ while (!UnspilledCSGPRs.empty() && Count < 2) {
+ unsigned Reg = UnspilledCSGPRs.back();
+ UnspilledCSGPRs.pop_back();
+ DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
+ << " to get a scratch register.\n");
+ MRI->setPhysRegUsed(Reg);
+ ExtraCSSpill = true;
+ ++Count;
+ }
+
+ // If we didn't find an extra callee-saved register to spill, create
+ // an emergency spill slot.
+ if (!ExtraCSSpill) {
+ const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+ int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+ RS->addScavengingFrameIndex(FI);
+ DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+ << " as the emergency spill slot.\n");
+ }
+ }
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 032dd90..0e00d16 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -1,4 +1,4 @@
-//==- AArch64FrameLowering.h - Define frame lowering for AArch64 -*- C++ -*--=//
+//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -7,100 +7,67 @@
//
//===----------------------------------------------------------------------===//
//
-// This class implements the AArch64-specific parts of the TargetFrameLowering
-// class.
+//
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64_FRAMEINFO_H
-#define LLVM_AARCH64_FRAMEINFO_H
+#ifndef AArch64_FRAMELOWERING_H
+#define AArch64_FRAMELOWERING_H
-#include "AArch64Subtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
+
class AArch64Subtarget;
+class AArch64TargetMachine;
class AArch64FrameLowering : public TargetFrameLowering {
-private:
- // In order to unify the spilling and restoring of callee-saved registers into
- // emitFrameMemOps, we need to be able to specify which instructions to use
- // for the relevant memory operations on each register class. An array of the
- // following struct is populated and passed in to achieve this.
- struct LoadStoreMethod {
- const TargetRegisterClass *RegClass; // E.g. GPR64RegClass
-
- // The preferred instruction.
- unsigned PairOpcode; // E.g. LSPair64_STR
-
- // Sometimes only a single register can be handled at once.
- unsigned SingleOpcode; // E.g. LS64_STR
- };
-protected:
- const AArch64Subtarget &STI;
+ const AArch64TargetMachine &TM;
public:
- explicit AArch64FrameLowering(const AArch64Subtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0, 16),
- STI(sti) {
- }
+ explicit AArch64FrameLowering(const AArch64TargetMachine &TM,
+ const AArch64Subtarget &STI)
+ : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+ false /*StackRealignable*/),
+ TM(TM) {}
+
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned FramePtr) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
- virtual void emitPrologue(MachineFunction &MF) const;
- virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- /// Decides how much stack adjustment to perform in each phase of the prologue
- /// and epilogue.
- void splitSPAdjustments(uint64_t Total, uint64_t &Initial,
- uint64_t &Residual) const;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+ int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg,
+ bool PreferFP = false) const;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
- int64_t resolveFrameIndexReference(MachineFunction &MF, int FrameIndex,
- unsigned &FrameReg, int SPAdj,
- bool IsCalleeSaveOp) const;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
- virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const;
+ /// \brief Can this function use the red zone for local allocations.
+ bool canUseRedZone(const MachineFunction &MF) const;
- virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
- virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const;
-
- /// If the register is X30 (i.e. LR) and the return address is used in the
- /// function then the callee-save store doesn't actually kill the register,
- /// otherwise it does.
- bool determinePrologueDeath(MachineBasicBlock &MBB, unsigned Reg) const;
-
- /// This function emits the loads or stores required during prologue and
- /// epilogue as efficiently as possible.
- ///
- /// The operations involved in setting up and tearing down the frame are
- /// similar enough to warrant a shared function, particularly as discrepancies
- /// between the two would be disastrous.
- void emitFrameMemOps(bool isStore, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI,
- const LoadStoreMethod PossibleClasses[],
- unsigned NumClasses) const;
-
-
- virtual bool hasFP(const MachineFunction &MF) const;
-
- virtual bool useFPForAddressing(const MachineFunction &MF) const;
-
- /// On AA
- virtual bool hasReservedCallFrame(const MachineFunction &MF) const;
-
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const override;
};
} // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index dac4b32..7007ffc 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -11,118 +11,119 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "aarch64-isel"
-#include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
-#include "Utils/AArch64BaseInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+#define DEBUG_TYPE "aarch64-isel"
+
//===--------------------------------------------------------------------===//
-/// AArch64 specific code to select AArch64 machine instructions for
-/// SelectionDAG operations.
+/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
+/// instructions for SelectionDAG operations.
///
namespace {
class AArch64DAGToDAGISel : public SelectionDAGISel {
AArch64TargetMachine &TM;
- /// Keep a pointer to the AArch64Subtarget around so that we can
+ /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
+ bool ForCodeSize;
+
public:
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm),
- Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
- }
+ : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+ ForCodeSize(false) {}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "AArch64 Instruction Selection";
}
- // Include the pieces autogenerated from the target description.
-#include "AArch64GenDAGISel.inc"
-
- template<unsigned MemSize>
- bool SelectOffsetUImm12(SDValue N, SDValue &UImm12) {
- const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
- if (!CN || CN->getZExtValue() % MemSize != 0
- || CN->getZExtValue() / MemSize > 0xfff)
- return false;
-
- UImm12 = CurDAG->getTargetConstant(CN->getZExtValue() / MemSize, MVT::i64);
- return true;
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+ ForCodeSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
}
- template<unsigned RegWidth>
- bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
- return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
- }
+ SDNode *Select(SDNode *Node) override;
- /// Used for pre-lowered address-reference nodes, so we already know
- /// the fields match. This operand's job is simply to add an
- /// appropriate shift operand to the MOVZ/MOVK instruction.
- template<unsigned LogShift>
- bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) {
- Imm = N;
- Shift = CurDAG->getTargetConstant(LogShift, MVT::i32);
- return true;
- }
-
- bool SelectFPZeroOperand(SDValue N, SDValue &Dummy);
-
- bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
- unsigned RegWidth);
-
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
char ConstraintCode,
- std::vector<SDValue> &OutOps);
+ std::vector<SDValue> &OutOps) override;
- bool SelectLogicalImm(SDValue N, SDValue &Imm);
-
- template<unsigned RegWidth>
- bool SelectTSTBOperand(SDValue N, SDValue &FixedPos) {
- return SelectTSTBOperand(N, FixedPos, RegWidth);
+ SDNode *SelectMLAV64LaneV128(SDNode *N);
+ SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+ bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+ bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+ bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+ bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+ return SelectShiftedRegister(N, false, Reg, Shift);
+ }
+ bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+ return SelectShiftedRegister(N, true, Reg, Shift);
+ }
+ bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 16, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 16, Base, OffImm);
}
- bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth);
+ template<int Width>
+ bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend, SDValue &DoShift) {
+ return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+ }
- SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32,
- unsigned Op64);
+ template<int Width>
+ bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend, SDValue &DoShift) {
+ return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+ }
- /// Put the given constant into a pool and return a DAG which will give its
- /// address.
- SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV);
-
- SDNode *TrySelectToMoveImm(SDNode *N);
- SDNode *LowerToFPLitPool(SDNode *Node);
- SDNode *SelectToLitPool(SDNode *N);
-
- SDNode* Select(SDNode*);
-private:
- /// Get the opcode for table lookup instruction
- unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
-
- /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4.
- /// IsExt is to indicate if the result will be extended with an argument.
- SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
-
- /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
- SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *Opcode);
-
- /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4.
- SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *Opcodes);
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
@@ -136,315 +137,713 @@
SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
unsigned SubRegs[]);
- /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4.
- /// The opcode array specifies the instructions used for load.
- SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *Opcodes);
+ SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
- /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4.
- /// The opcode arrays specify the instructions used for load/store.
- SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
- unsigned NumVecs, const uint16_t *Opcodes);
+ SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
- SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
- SDValue Operand);
+ SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx);
+ SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx);
+ SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+ SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+ SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
+ SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
+
+ SDNode *SelectBitfieldExtractOp(SDNode *N);
+ SDNode *SelectBitfieldInsertOp(SDNode *N);
+
+ SDNode *SelectLIBM(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+private:
+ bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+ SDValue &Shift);
+ bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &Offset, SDValue &SignExtend,
+ SDValue &DoShift);
+ bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &Offset, SDValue &SignExtend,
+ SDValue &DoShift);
+ bool isWorthFolding(SDValue V) const;
+ bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
+ SDValue &Offset, SDValue &SignExtend);
+
+ template<unsigned RegWidth>
+ bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+ return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+ }
+
+ bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
};
+} // end anonymous namespace
+
+/// isIntImmediate - This method tests to see if the node is a constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+ if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
+ Imm = C->getZExtValue();
+ return true;
+ }
+ return false;
}
-bool
-AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
- unsigned RegWidth) {
- const ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
- if (!CN) return false;
+// isIntImmediate - This method tests to see if a constant operand.
+// If so Imm will receive the value.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+ return isIntImmediate(N.getNode(), Imm);
+}
- // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
- // is between 1 and 32 for a destination w-register, or 1 and 64 for an
- // x-register.
- //
- // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
- // want THIS_NODE to be 2^fbits. This is much easier to deal with using
- // integers.
- bool IsExact;
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+ uint64_t &Imm) {
+ return N->getOpcode() == Opc &&
+ isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
- // fbits is between 1 and 64 in the worst-case, which means the fmul
- // could have 2^64 as an actual operand. Need 65 bits of precision.
- APSInt IntVal(65, true);
- CN->getValueAPF().convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+ assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+ // Require the address to be in a register. That is safe for all AArch64
+ // variants and it is hard to do anything much smarter without knowing
+ // how the operand is used.
+ OutOps.push_back(Op);
+ return false;
+}
- // N.b. isPowerOf2 also checks for > 0.
- if (!IsExact || !IntVal.isPowerOf2()) return false;
- unsigned FBits = IntVal.logBase2();
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+ SDValue &Shift) {
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+ // which lists [imm] as the list of opcode it's interested in, however
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ if (!isa<ConstantSDNode>(N.getNode()))
+ return false;
- // Checks above should have guaranteed that we haven't lost information in
- // finding FBits, but it must still be in range.
- if (FBits == 0 || FBits > RegWidth) return false;
+ uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+ unsigned ShiftAmt;
- FixedPos = CurDAG->getTargetConstant(64 - FBits, MVT::i32);
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return false;
+
+ unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+ Val = CurDAG->getTargetConstant(Immed, MVT::i32);
+ Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
return true;
}
-bool
-AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
- char ConstraintCode,
- std::vector<SDValue> &OutOps) {
- switch (ConstraintCode) {
- default: llvm_unreachable("Unrecognised AArch64 memory constraint");
- case 'm':
- // FIXME: more freedom is actually permitted for 'm'. We can go
- // hunting for a base and an offset if we want. Of course, since
- // we don't really know how the operand is going to be used we're
- // probably restricted to the load/store pair's simm7 as an offset
- // range anyway.
- case 'Q':
- OutOps.push_back(Op);
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+ SDValue &Shift) {
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+ // which lists [imm] as the list of opcode it's interested in, however
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ if (!isa<ConstantSDNode>(N.getNode()))
+ return false;
+
+ // The immediate operand must be a 24-bit zero-extended immediate.
+ uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+ // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+ // have the opposite effect on the C flag, so this pattern mustn't match under
+ // those circumstances.
+ if (Immed == 0)
+ return false;
+
+ if (N.getValueType() == MVT::i32)
+ Immed = ~((uint32_t)Immed) + 1;
+ else
+ Immed = ~Immed + 1ULL;
+ if (Immed & 0xFFFFFFFFFF000000ULL)
+ return false;
+
+ Immed &= 0xFFFFFFULL;
+ return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+}
+
+/// getShiftTypeForNode - Translate a shift node to the corresponding
+/// ShiftType value.
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
+ switch (N.getOpcode()) {
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ case ISD::SHL:
+ return AArch64_AM::LSL;
+ case ISD::SRL:
+ return AArch64_AM::LSR;
+ case ISD::SRA:
+ return AArch64_AM::ASR;
+ case ISD::ROTR:
+ return AArch64_AM::ROR;
+ }
+}
+
+/// \brief Determine wether it is worth to fold V into an extended register.
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+ // it hurts if the a value is used at least twice, unless we are optimizing
+ // for code size.
+ if (ForCodeSize || V.hasOneUse())
+ return true;
+ return false;
+}
+
+/// SelectShiftedRegister - Select a "shifted register" operand. If the value
+/// is not shifted, set the Shift operand to default of "LSL 0". The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not. The AllowROR parameter specifies whether ROR is
+/// supported.
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+ SDValue &Reg, SDValue &Shift) {
+ AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+ if (ShType == AArch64_AM::InvalidShiftExtend)
+ return false;
+ if (!AllowROR && ShType == AArch64_AM::ROR)
+ return false;
+
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ unsigned BitSize = N.getValueType().getSizeInBits();
+ unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+ unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
+
+ Reg = N.getOperand(0);
+ Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+ return isWorthFolding(N);
}
return false;
}
-bool
-AArch64DAGToDAGISel::SelectFPZeroOperand(SDValue N, SDValue &Dummy) {
- ConstantFPSDNode *Imm = dyn_cast<ConstantFPSDNode>(N);
- if (!Imm || !Imm->getValueAPF().isPosZero())
+/// getExtendTypeForNode - Translate an extend node to the corresponding
+/// ExtendType value.
+static AArch64_AM::ShiftExtendType
+getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
+ if (N.getOpcode() == ISD::SIGN_EXTEND ||
+ N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT SrcVT;
+ if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
+ SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
+ else
+ SrcVT = N.getOperand(0).getValueType();
+
+ if (!IsLoadStore && SrcVT == MVT::i8)
+ return AArch64_AM::SXTB;
+ else if (!IsLoadStore && SrcVT == MVT::i16)
+ return AArch64_AM::SXTH;
+ else if (SrcVT == MVT::i32)
+ return AArch64_AM::SXTW;
+ assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+ return AArch64_AM::InvalidShiftExtend;
+ } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
+ N.getOpcode() == ISD::ANY_EXTEND) {
+ EVT SrcVT = N.getOperand(0).getValueType();
+ if (!IsLoadStore && SrcVT == MVT::i8)
+ return AArch64_AM::UXTB;
+ else if (!IsLoadStore && SrcVT == MVT::i16)
+ return AArch64_AM::UXTH;
+ else if (SrcVT == MVT::i32)
+ return AArch64_AM::UXTW;
+ assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+ return AArch64_AM::InvalidShiftExtend;
+ } else if (N.getOpcode() == ISD::AND) {
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return AArch64_AM::InvalidShiftExtend;
+ uint64_t AndMask = CSD->getZExtValue();
+
+ switch (AndMask) {
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ case 0xFF:
+ return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
+ case 0xFFFF:
+ return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
+ case 0xFFFFFFFF:
+ return AArch64_AM::UXTW;
+ }
+ }
+
+ return AArch64_AM::InvalidShiftExtend;
+}
+
+// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
+ if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
+ DL->getOpcode() != AArch64ISD::DUPLANE32)
return false;
- // Doesn't actually carry any information, but keeps TableGen quiet.
- Dummy = CurDAG->getTargetConstant(0, MVT::i32);
+ SDValue SV = DL->getOperand(0);
+ if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
+ return false;
+
+ SDValue EV = SV.getOperand(1);
+ if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
+ ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
+ LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
+ LaneOp = EV.getOperand(0);
+
return true;
}
-bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) {
- uint32_t Bits;
- uint32_t RegWidth = N.getValueType().getSizeInBits();
+// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
+// high lane extract.
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
+ SDValue &LaneOp, int &LaneIdx) {
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
- if (!CN) return false;
-
- if (!A64Imms::isLogicalImm(RegWidth, CN->getZExtValue(), Bits))
- return false;
-
- Imm = CurDAG->getTargetConstant(Bits, MVT::i32);
+ if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
+ std::swap(Op0, Op1);
+ if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
+ return false;
+ }
+ StdOp = Op1;
return true;
}
-SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
- SDNode *ResNode;
- SDLoc dl(Node);
- EVT DestType = Node->getValueType(0);
- unsigned DestWidth = DestType.getSizeInBits();
+/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
+/// is a lane in the upper half of a 128-bit vector. Recognize and select this
+/// so that we don't emit unnecessary lane extracts.
+SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue MLAOp1; // Will hold ordinary multiplicand for MLA.
+ SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA.
+ int LaneIdx = -1; // Will hold the lane index.
- unsigned MOVOpcode;
- EVT MOVType;
- int UImm16, Shift;
- uint32_t LogicalBits;
-
- uint64_t BitPat = cast<ConstantSDNode>(Node)->getZExtValue();
- if (A64Imms::isMOVZImm(DestWidth, BitPat, UImm16, Shift)) {
- MOVType = DestType;
- MOVOpcode = DestWidth == 64 ? AArch64::MOVZxii : AArch64::MOVZwii;
- } else if (A64Imms::isMOVNImm(DestWidth, BitPat, UImm16, Shift)) {
- MOVType = DestType;
- MOVOpcode = DestWidth == 64 ? AArch64::MOVNxii : AArch64::MOVNwii;
- } else if (DestWidth == 64 && A64Imms::isMOVNImm(32, BitPat, UImm16, Shift)) {
- // To get something like 0x0000_0000_ffff_1234 into a 64-bit register we can
- // use a 32-bit instruction: "movn w0, 0xedbc".
- MOVType = MVT::i32;
- MOVOpcode = AArch64::MOVNwii;
- } else if (A64Imms::isLogicalImm(DestWidth, BitPat, LogicalBits)) {
- MOVOpcode = DestWidth == 64 ? AArch64::ORRxxi : AArch64::ORRwwi;
- uint16_t ZR = DestWidth == 64 ? AArch64::XZR : AArch64::WZR;
-
- return CurDAG->getMachineNode(MOVOpcode, dl, DestType,
- CurDAG->getRegister(ZR, DestType),
- CurDAG->getTargetConstant(LogicalBits, MVT::i32));
- } else {
- // Can't handle it in one instruction. There's scope for permitting two (or
- // more) instructions, but that'll need more thought.
- return NULL;
+ if (Op1.getOpcode() != ISD::MUL ||
+ !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+ LaneIdx)) {
+ std::swap(Op0, Op1);
+ if (Op1.getOpcode() != ISD::MUL ||
+ !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+ LaneIdx))
+ return nullptr;
}
- ResNode = CurDAG->getMachineNode(MOVOpcode, dl, MOVType,
- CurDAG->getTargetConstant(UImm16, MVT::i32),
- CurDAG->getTargetConstant(Shift, MVT::i32));
+ SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
- if (MOVType != DestType) {
- ResNode = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
- MVT::i64, MVT::i32, MVT::Other,
- CurDAG->getTargetConstant(0, MVT::i64),
- SDValue(ResNode, 0),
- CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32));
- }
+ SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
- return ResNode;
-}
+ unsigned MLAOpc = ~0U;
-SDValue
-AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL,
- const Constant *CV) {
- EVT PtrVT = getTargetLowering()->getPointerTy();
-
- switch (getTargetLowering()->getTargetMachine().getCodeModel()) {
- case CodeModel::Small: {
- unsigned Alignment =
- getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
- return CurDAG->getNode(
- AArch64ISD::WrapperSmall, DL, PtrVT,
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG),
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_LO12),
- CurDAG->getConstant(Alignment, MVT::i32));
- }
- case CodeModel::Large: {
- SDNode *LitAddr;
- LitAddr = CurDAG->getMachineNode(
- AArch64::MOVZxii, DL, PtrVT,
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
- CurDAG->getTargetConstant(3, MVT::i32));
- LitAddr = CurDAG->getMachineNode(
- AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
- CurDAG->getTargetConstant(2, MVT::i32));
- LitAddr = CurDAG->getMachineNode(
- AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
- CurDAG->getTargetConstant(1, MVT::i32));
- LitAddr = CurDAG->getMachineNode(
- AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC),
- CurDAG->getTargetConstant(0, MVT::i32));
- return SDValue(LitAddr, 0);
- }
+ switch (N->getSimpleValueType(0).SimpleTy) {
default:
- llvm_unreachable("Only small and large code models supported now");
+ llvm_unreachable("Unrecognized MLA.");
+ case MVT::v4i16:
+ MLAOpc = AArch64::MLAv4i16_indexed;
+ break;
+ case MVT::v8i16:
+ MLAOpc = AArch64::MLAv8i16_indexed;
+ break;
+ case MVT::v2i32:
+ MLAOpc = AArch64::MLAv2i32_indexed;
+ break;
+ case MVT::v4i32:
+ MLAOpc = AArch64::MLAv4i32_indexed;
+ break;
}
+
+ return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops);
}
-SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
- SDLoc DL(Node);
- uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue();
- int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue();
- EVT DestType = Node->getValueType(0);
+SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+ SDValue SMULLOp0;
+ SDValue SMULLOp1;
+ int LaneIdx;
- // Since we may end up loading a 64-bit constant from a 32-bit entry the
- // constant in the pool may have a different type to the eventual node.
- ISD::LoadExtType Extension;
- EVT MemType;
+ if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
+ LaneIdx))
+ return nullptr;
- assert((DestType == MVT::i64 || DestType == MVT::i32)
- && "Only expect integer constants at the moment");
+ SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
- if (DestType == MVT::i32) {
- Extension = ISD::NON_EXTLOAD;
- MemType = MVT::i32;
- } else if (UnsignedVal <= UINT32_MAX) {
- Extension = ISD::ZEXTLOAD;
- MemType = MVT::i32;
- } else if (SignedVal >= INT32_MIN && SignedVal <= INT32_MAX) {
- Extension = ISD::SEXTLOAD;
- MemType = MVT::i32;
+ SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
+
+ unsigned SMULLOpc = ~0U;
+
+ if (IntNo == Intrinsic::aarch64_neon_smull) {
+ switch (N->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unrecognized SMULL.");
+ case MVT::v4i32:
+ SMULLOpc = AArch64::SMULLv4i16_indexed;
+ break;
+ case MVT::v2i64:
+ SMULLOpc = AArch64::SMULLv2i32_indexed;
+ break;
+ }
+ } else if (IntNo == Intrinsic::aarch64_neon_umull) {
+ switch (N->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unrecognized SMULL.");
+ case MVT::v4i32:
+ SMULLOpc = AArch64::UMULLv4i16_indexed;
+ break;
+ case MVT::v2i64:
+ SMULLOpc = AArch64::UMULLv2i32_indexed;
+ break;
+ }
+ } else
+ llvm_unreachable("Unrecognized intrinsic.");
+
+ return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+/// Instructions that accept extend modifiers like UXTW expect the register
+/// being extended to be a GPR32, but the incoming DAG might be acting on a
+/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
+/// this is the case.
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+ if (N.getValueType() == MVT::i32)
+ return N;
+
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+ MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ SDLoc(N), MVT::i32, N, SubReg);
+ return SDValue(Node, 0);
+}
+
+
+/// SelectArithExtendedRegister - Select a "extended register" operand. This
+/// operand folds in an extend followed by an optional left shift.
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+ SDValue &Shift) {
+ unsigned ShiftVal = 0;
+ AArch64_AM::ShiftExtendType Ext;
+
+ if (N.getOpcode() == ISD::SHL) {
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return false;
+ ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 4)
+ return false;
+
+ Ext = getExtendTypeForNode(N.getOperand(0));
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return false;
+
+ Reg = N.getOperand(0).getOperand(0);
} else {
- Extension = ISD::NON_EXTLOAD;
- MemType = MVT::i64;
+ Ext = getExtendTypeForNode(N);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return false;
+
+ Reg = N.getOperand(0);
}
- Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(),
- MemType.getSizeInBits()),
- UnsignedVal);
- SDValue PoolAddr = getConstantPoolItemAddress(DL, CV);
- unsigned Alignment =
- getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
-
- return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(),
- PoolAddr,
- MachinePointerInfo::getConstantPool(), MemType,
- /* isVolatile = */ false,
- /* isNonTemporal = */ false,
- Alignment).getNode();
+ // AArch64 mandates that the RHS of the operation must use the smallest
+ // register classs that could contain the size being extended from. Thus,
+ // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+ // there might not be an actual 32-bit value in the program. We can
+ // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
+ assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
+ Reg = narrowIfNeeded(CurDAG, Reg);
+ Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
+ return isWorthFolding(N);
}
-SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) {
- SDLoc DL(Node);
- const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue();
- EVT DestType = Node->getValueType(0);
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &OffImm) {
+ const TargetLowering *TLI = getTargetLowering();
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+ return true;
+ }
- unsigned Alignment =
- getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType());
- SDValue PoolAddr = getConstantPoolItemAddress(DL, FV);
+ if (N.getOpcode() == AArch64ISD::ADDlow) {
+ GlobalAddressSDNode *GAN =
+ dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
+ Base = N.getOperand(0);
+ OffImm = N.getOperand(1);
+ if (!GAN)
+ return true;
- return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(),
- /* isVolatile = */ false,
- /* isNonTemporal = */ false,
- /* isInvariant = */ true,
- Alignment).getNode();
-}
+ const GlobalValue *GV = GAN->getGlobal();
+ unsigned Alignment = GV->getAlignment();
+ const DataLayout *DL = TLI->getDataLayout();
+ if (Alignment == 0 && !Subtarget->isTargetDarwin())
+ Alignment = DL->getABITypeAlignment(GV->getType()->getElementType());
-bool
-AArch64DAGToDAGISel::SelectTSTBOperand(SDValue N, SDValue &FixedPos,
- unsigned RegWidth) {
- const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
- if (!CN) return false;
+ if (Alignment >= Size)
+ return true;
+ }
- uint64_t Val = CN->getZExtValue();
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = (int64_t)RHS->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64);
+ return true;
+ }
+ }
+ }
- if (!isPowerOf2_64(Val)) return false;
+ // Before falling back to our general case, check if the unscaled
+ // instructions can handle this. If so, that's preferable.
+ if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
+ return false;
- unsigned TestedBit = Log2_64(Val);
- // Checks above should have guaranteed that we haven't lost information in
- // finding TestedBit, but it must still be in range.
- if (TestedBit >= RegWidth) return false;
-
- FixedPos = CurDAG->getTargetConstant(TestedBit, MVT::i64);
+ // Base only. The address will be materialized into a register before
+ // the memory is accessed.
+ // add x0, Xbase, #offset
+ // ldr x0, [x0]
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, MVT::i64);
return true;
}
-SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
- unsigned Op16,unsigned Op32,
- unsigned Op64) {
- // Mostly direct translation to the given operations, except that we preserve
- // the AtomicOrdering for use later on.
- AtomicSDNode *AN = cast<AtomicSDNode>(Node);
- EVT VT = AN->getMemoryVT();
+/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
+/// immediate" address. This should only match when there is an offset that
+/// is not valid for a scaled immediate addressing mode. The "Size" argument
+/// is the size in bytes of the memory reference, which is needed here to know
+/// what is valid for a scaled immediate.
+bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
+ SDValue &Base,
+ SDValue &OffImm) {
+ if (!CurDAG->isBaseWithConstantOffset(N))
+ return false;
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = RHS->getSExtValue();
+ // If the offset is valid as a scaled immediate, don't match here.
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
+ RHSC < (0x1000 << Log2_32(Size)))
+ return false;
+ if (RHSC >= -256 && RHSC < 256) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ const TargetLowering *TLI = getTargetLowering();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64);
+ return true;
+ }
+ }
+ return false;
+}
- unsigned Op;
- if (VT == MVT::i8)
- Op = Op8;
- else if (VT == MVT::i16)
- Op = Op16;
- else if (VT == MVT::i32)
- Op = Op32;
- else if (VT == MVT::i64)
- Op = Op64;
- else
- llvm_unreachable("Unexpected atomic operation");
+static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+ SDValue ImpDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64),
+ 0);
+ MachineSDNode *Node = CurDAG->getMachineNode(
+ TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg);
+ return SDValue(Node, 0);
+}
- SmallVector<SDValue, 4> Ops;
- for (unsigned i = 1; i < AN->getNumOperands(); ++i)
- Ops.push_back(AN->getOperand(i));
+/// \brief Check if the given SHL node (\p N), can be used to form an
+/// extended register for an addressing mode.
+bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
+ bool WantExtend, SDValue &Offset,
+ SDValue &SignExtend) {
+ assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
+ return false;
- Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
- Ops.push_back(AN->getOperand(0)); // Chain moves to the end
+ if (WantExtend) {
+ AArch64_AM::ShiftExtendType Ext =
+ getExtendTypeForNode(N.getOperand(0), true);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return false;
- return CurDAG->SelectNodeTo(Node, Op,
- AN->getValueType(0), MVT::Other,
- &Ops[0], Ops.size());
+ Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
+ SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+ } else {
+ Offset = N.getOperand(0);
+ SignExtend = CurDAG->getTargetConstant(0, MVT::i32);
+ }
+
+ unsigned LegalShiftVal = Log2_32(Size);
+ unsigned ShiftVal = CSD->getZExtValue();
+
+ if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
+ return false;
+
+ if (isWorthFolding(N))
+ return true;
+
+ return false;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend,
+ SDValue &DoShift) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+ SDValue LHS = N.getOperand(0);
+ SDValue RHS = N.getOperand(1);
+
+ // We don't want to match immediate adds here, because they are better lowered
+ // to the register-immediate addressing modes.
+ if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = N.getNode();
+ for (SDNode *UI : Node->uses()) {
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
+ // Remember if it is worth folding N when it produces extended register.
+ bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+ // Try to match a shifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
+ Base = LHS;
+ DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+ return true;
+ }
+
+ // Try to match a shifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
+ Base = RHS;
+ DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+ return true;
+ }
+
+ // There was no shift, whatever else we find.
+ DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+
+ AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
+ // Try to match an unshifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding &&
+ (Ext = getExtendTypeForNode(LHS, true)) !=
+ AArch64_AM::InvalidShiftExtend) {
+ Base = RHS;
+ Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
+ SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+ if (isWorthFolding(LHS))
+ return true;
+ }
+
+ // Try to match an unshifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding &&
+ (Ext = getExtendTypeForNode(RHS, true)) !=
+ AArch64_AM::InvalidShiftExtend) {
+ Base = LHS;
+ Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
+ SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+ if (isWorthFolding(RHS))
+ return true;
+ }
+
+ return false;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &Offset,
+ SDValue &SignExtend,
+ SDValue &DoShift) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+ SDValue LHS = N.getOperand(0);
+ SDValue RHS = N.getOperand(1);
+
+ // We don't want to match immediate adds here, because they are better lowered
+ // to the register-immediate addressing modes.
+ if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = N.getNode();
+ for (SDNode *UI : Node->uses()) {
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
+ // Remember if it is worth folding N when it produces extended register.
+ bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+ // Try to match a shifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
+ Base = LHS;
+ DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+ return true;
+ }
+
+ // Try to match a shifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
+ Base = RHS;
+ DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+ return true;
+ }
+
+ // Match any non-shifted, non-extend, non-immediate add expression.
+ Base = LHS;
+ Offset = RHS;
+ SignExtend = CurDAG->getTargetConstant(false, MVT::i32);
+ DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+ // Reg1 + Reg2 is free: no check needed.
+ return true;
}
SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
- static unsigned RegClassIDs[] = { AArch64::DPairRegClassID,
- AArch64::DTripleRegClassID,
- AArch64::DQuadRegClassID };
- static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1,
- AArch64::dsub_2, AArch64::dsub_3 };
+ static unsigned RegClassIDs[] = {
+ AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+ static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1,
+ AArch64::dsub2, AArch64::dsub3 };
return createTuple(Regs, RegClassIDs, SubRegs);
}
SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
- static unsigned RegClassIDs[] = { AArch64::QPairRegClassID,
- AArch64::QTripleRegClassID,
- AArch64::QQuadRegClassID };
- static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1,
- AArch64::qsub_2, AArch64::qsub_3 };
+ static unsigned RegClassIDs[] = {
+ AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+ static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
return createTuple(Regs, RegClassIDs, SubRegs);
}
@@ -478,1100 +877,2159 @@
return SDValue(N, 0);
}
-
-// Get the register stride update opcode of a VLD/VST instruction that
-// is otherwise equivalent to the given fixed stride updating instruction.
-static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
- switch (Opc) {
- default: break;
- case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register;
- case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register;
- case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register;
- case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register;
- case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register;
- case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register;
- case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register;
- case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register;
-
- case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register;
- case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register;
- case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register;
- case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register;
- case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register;
- case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register;
- case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register;
-
- case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register;
- case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register;
- case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register;
- case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register;
- case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register;
- case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register;
- case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register;
-
- case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register;
- case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register;
- case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register;
- case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register;
- case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register;
- case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register;
- case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register;
-
- case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register;
- case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register;
- case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register;
- case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register;
- case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register;
- case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register;
- case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register;
- case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register;
-
- case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register;
- case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register;
- case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register;
- case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register;
- case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register;
- case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register;
- case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register;
- case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register;
-
- case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register;
- case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register;
- case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register;
- case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register;
- case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register;
- case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register;
- case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register;
- case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register;
-
- case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register;
- case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register;
- case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register;
- case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register;
- case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register;
- case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register;
- case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register;
- case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register;
-
- case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register;
- case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register;
- case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register;
- case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register;
- case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register;
- case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register;
- case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register;
-
- case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register;
- case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register;
- case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register;
- case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register;
- case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register;
- case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register;
- case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register;
-
- case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register;
- case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register;
- case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register;
- case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register;
- case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register;
- case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register;
- case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register;
-
- case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register;
- case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register;
- case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register;
- case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register;
- case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register;
- case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register;
- case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register;
- case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register;
-
- case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register;
- case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register;
- case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register;
- case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register;
- case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register;
- case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register;
- case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register;
- case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register;
-
- case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register;
- case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register;
- case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register;
- case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register;
- case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register;
- case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register;
- case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register;
- case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register;
-
- // Post-index of duplicate loads
- case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register;
- case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register;
- case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register;
- case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register;
- case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register;
- case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register;
- case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register;
- case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register;
-
- case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register;
- case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register;
- case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register;
- case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register;
- case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register;
- case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register;
- case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register;
- case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register;
-
- case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register;
- case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register;
- case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register;
- case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register;
- case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register;
- case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register;
- case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register;
- case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register;
-
- // Post-index of lane loads
- case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register;
- case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register;
- case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register;
- case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register;
-
- case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register;
- case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register;
- case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register;
- case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register;
-
- case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register;
- case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register;
- case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register;
- case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register;
-
- // Post-index of lane stores
- case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register;
- case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register;
- case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register;
- case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register;
-
- case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register;
- case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register;
- case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register;
- case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register;
-
- case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register;
- case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register;
- case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register;
- case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register;
- }
- return Opc; // If not one we handle, return it unchanged.
-}
-
-SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating,
- unsigned NumVecs,
- const uint16_t *Opcodes) {
- assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
-
+SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
+ unsigned Opc, bool isExt) {
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
- unsigned OpcodeIndex;
- bool is64BitVector = VT.is64BitVector();
- switch (VT.getScalarType().getSizeInBits()) {
- case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
- case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
- case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
- case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
- default: llvm_unreachable("unhandled vector load type");
- }
- unsigned Opc = Opcodes[OpcodeIndex];
- SmallVector<SDValue, 2> Ops;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
- Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+ unsigned ExtOff = isExt;
- if (isUpdating) {
- SDValue Inc = N->getOperand(AddrOpIdx + 1);
- if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- Ops.push_back(Inc);
- }
+ // Form a REG_SEQUENCE to force register allocation.
+ unsigned Vec0Off = ExtOff + 1;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
+ N->op_begin() + Vec0Off + NumVecs);
+ SDValue RegSeq = createQTuple(Regs);
- Ops.push_back(N->getOperand(0)); // Push back the Chain
-
- SmallVector<EVT, 3> ResTys;
- // Push back the type of return super register
- if (NumVecs == 1)
- ResTys.push_back(VT);
- else if (NumVecs == 3)
- ResTys.push_back(MVT::Untyped);
- else {
- EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
- is64BitVector ? NumVecs : NumVecs * 2);
- ResTys.push_back(ResTy);
- }
-
- if (isUpdating)
- ResTys.push_back(MVT::i64); // Type of the updated register
- ResTys.push_back(MVT::Other); // Type of the Chain
- SDLoc dl(N);
- SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
-
- // Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
-
- if (NumVecs == 1)
- return VLd;
-
- // If NumVecs > 1, the return result is a super register containing 2-4
- // consecutive vector registers.
- SDValue SuperReg = SDValue(VLd, 0);
-
- unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
- ReplaceUses(SDValue(N, Vec),
- CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
- // Update users of the Chain
- ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
- if (isUpdating)
- ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
-
- return NULL;
+ SmallVector<SDValue, 6> Ops;
+ if (isExt)
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
+ return CurDAG->getMachineNode(Opc, dl, VT, Ops);
}
-SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating,
- unsigned NumVecs,
- const uint16_t *Opcodes) {
- assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
- SDLoc dl(N);
+SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (LD->isUnindexed())
+ return nullptr;
+ EVT VT = LD->getMemoryVT();
+ EVT DstVT = N->getValueType(0);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ // We're not doing validity checking here. That was done when checking
+ // if we should mark the load as indexed or not. We're just selecting
+ // the right instruction.
+ unsigned Opcode = 0;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
- unsigned Vec0Idx = 3;
- EVT VT = N->getOperand(Vec0Idx).getValueType();
- unsigned OpcodeIndex;
- bool is64BitVector = VT.is64BitVector();
- switch (VT.getScalarType().getSizeInBits()) {
- case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
- case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
- case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
- case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
- default: llvm_unreachable("unhandled vector store type");
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ bool InsertTo64 = false;
+ if (VT == MVT::i64)
+ Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
+ else if (VT == MVT::i32) {
+ if (ExtType == ISD::NON_EXTLOAD)
+ Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+ else if (ExtType == ISD::SEXTLOAD)
+ Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
+ else {
+ Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+ InsertTo64 = true;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::i16) {
+ if (ExtType == ISD::SEXTLOAD) {
+ if (DstVT == MVT::i64)
+ Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
+ else
+ Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
+ } else {
+ Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
+ InsertTo64 = DstVT == MVT::i64;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::i8) {
+ if (ExtType == ISD::SEXTLOAD) {
+ if (DstVT == MVT::i64)
+ Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
+ else
+ Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
+ } else {
+ Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
+ InsertTo64 = DstVT == MVT::i64;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::f32) {
+ Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
+ } else if (VT == MVT::f64 || VT.is64BitVector()) {
+ Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
+ } else if (VT.is128BitVector()) {
+ Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
+ } else
+ return nullptr;
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+ int OffsetVal = (int)OffsetOp->getZExtValue();
+ SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64);
+ SDValue Ops[] = { Base, Offset, Chain };
+ SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT,
+ MVT::Other, Ops);
+ // Either way, we're replacing the node, so tell the caller that.
+ Done = true;
+ SDValue LoadedVal = SDValue(Res, 1);
+ if (InsertTo64) {
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+ LoadedVal =
+ SDValue(CurDAG->getMachineNode(
+ AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg),
+ 0);
}
- unsigned Opc = Opcodes[OpcodeIndex];
+ ReplaceUses(SDValue(N, 0), LoadedVal);
+ ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
+ ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
+
+ return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
+ unsigned Opc, unsigned SubRegIdx) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(2)); // Mem operand;
+ Ops.push_back(Chain);
+
+ std::vector<EVT> ResTys;
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ SDValue SuperReg = SDValue(Ld, 0);
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i),
+ CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+ return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+ unsigned Opc, unsigned SubRegIdx) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(1)); // Mem operand
+ Ops.push_back(N->getOperand(2)); // Incremental
+ Ops.push_back(Chain);
+
+ std::vector<EVT> ResTys;
+ ResTys.push_back(MVT::i64); // Type of the write back register
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Update uses of write back register
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+ // Update uses of vector list
+ SDValue SuperReg = SDValue(Ld, 1);
+ if (NumVecs == 1)
+ ReplaceUses(SDValue(N, 0), SuperReg);
+ else
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i),
+ CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+ // Update the chain
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+ return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+
+ // Form a REG_SEQUENCE to force register allocation.
+ bool Is128Bit = VT.getSizeInBits() == 128;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+ SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(NumVecs + 2));
+ Ops.push_back(N->getOperand(0));
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+ return St;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
SmallVector<EVT, 2> ResTys;
- if (isUpdating)
- ResTys.push_back(MVT::i64);
+ ResTys.push_back(MVT::i64); // Type of the write back register
ResTys.push_back(MVT::Other); // Type for the Chain
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+ // Form a REG_SEQUENCE to force register allocation.
+ bool Is128Bit = VT.getSizeInBits() == 128;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+ SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
- if (isUpdating) {
- SDValue Inc = N->getOperand(AddrOpIdx + 1);
- if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- Ops.push_back(Inc);
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(NumVecs + 1)); // base register
+ Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental
+ Ops.push_back(N->getOperand(0)); // Chain
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ return St;
+}
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+class WidenVector {
+ SelectionDAG &DAG;
+
+public:
+ WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
+
+ SDValue operator()(SDValue V64Reg) {
+ EVT VT = V64Reg.getValueType();
+ unsigned NarrowSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+ SDLoc DL(V64Reg);
+
+ SDValue Undef =
+ SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
+ return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
+ }
+};
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+ EVT VT = V128Reg.getValueType();
+ unsigned WideSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+
+ return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
+ V128Reg);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+ if (Narrow)
+ std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ std::vector<EVT> ResTys;
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+ Ops.push_back(N->getOperand(NumVecs + 3));
+ Ops.push_back(N->getOperand(0));
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ SDValue SuperReg = SDValue(Ld, 0);
+
+ EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+ static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
+ AArch64::qsub3 };
+ for (unsigned i = 0; i < NumVecs; ++i) {
+ SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
+ if (Narrow)
+ NV = NarrowVector(NV, *CurDAG);
+ ReplaceUses(SDValue(N, i), NV);
}
- SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
- N->op_begin() + Vec0Idx + NumVecs);
- SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs);
- Ops.push_back(SrcReg);
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
- // Push back the Chain
+ return Ld;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+ if (Narrow)
+ std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ std::vector<EVT> ResTys;
+ ResTys.push_back(MVT::i64); // Type of the write back register
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number
+ Ops.push_back(N->getOperand(NumVecs + 2)); // Base register
+ Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
Ops.push_back(N->getOperand(0));
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Update uses of the write back register
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+ // Update uses of the vector list
+ SDValue SuperReg = SDValue(Ld, 1);
+ if (NumVecs == 1) {
+ ReplaceUses(SDValue(N, 0),
+ Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
+ } else {
+ EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+ static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
+ AArch64::qsub3 };
+ for (unsigned i = 0; i < NumVecs; ++i) {
+ SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
+ SuperReg);
+ if (Narrow)
+ NV = NarrowVector(NV, *CurDAG);
+ ReplaceUses(SDValue(N, i), NV);
+ }
+ }
+
+ // Update the Chain
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+
+ return Ld;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+ if (Narrow)
+ std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+ Ops.push_back(N->getOperand(NumVecs + 3));
+ Ops.push_back(N->getOperand(0));
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
// Transfer memoperands.
- SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return VSt;
+ return St;
}
-SDValue
-AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
- SDValue Operand) {
- SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL,
- VT, VTD, MVT::Other,
- CurDAG->getTargetConstant(0, MVT::i64),
- Operand,
- CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32));
- return SDValue(Reg, 0);
-}
-
-SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
- unsigned NumVecs,
- const uint16_t *Opcodes) {
- assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range");
+SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+ if (Narrow)
+ std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ SmallVector<EVT, 2> ResTys;
+ ResTys.push_back(MVT::i64); // Type of the write back register
+ ResTys.push_back(MVT::Other);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+ Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register
+ Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
+ Ops.push_back(N->getOperand(0));
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ return St;
+}
+
+static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
+ unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB,
+ unsigned NumberOfIgnoredLowBits,
+ bool BiggerPattern) {
+ assert(N->getOpcode() == ISD::AND &&
+ "N must be a AND operation to call this function");
EVT VT = N->getValueType(0);
- unsigned OpcodeIndex;
- bool is64BitVector = VT.is64BitVector();
- switch (VT.getScalarType().getSizeInBits()) {
- case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
- case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
- case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
- case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
- default: llvm_unreachable("unhandled vector duplicate lane load type");
- }
- unsigned Opc = Opcodes[OpcodeIndex];
- SDValue SuperReg;
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(N->getOperand(1)); // Push back the Memory Address
- if (isUpdating) {
- SDValue Inc = N->getOperand(2);
- if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- Ops.push_back(Inc);
- }
- Ops.push_back(N->getOperand(0)); // Push back the Chain
+ // Here we can test the type of VT and return false when the type does not
+ // match, but since it is done prior to that call in the current context
+ // we turned that into an assert to avoid redundant code.
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
- SmallVector<EVT, 3> ResTys;
- // Push back the type of return super register
- if (NumVecs == 3)
- ResTys.push_back(MVT::Untyped);
- else {
- EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
- is64BitVector ? NumVecs : NumVecs * 2);
- ResTys.push_back(ResTy);
- }
- if (isUpdating)
- ResTys.push_back(MVT::i64); // Type of the updated register
- ResTys.push_back(MVT::Other); // Type of the Chain
- SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ // FIXME: simplify-demanded-bits in DAGCombine will probably have
+ // changed the AND node to a 32-bit mask operation. We'll have to
+ // undo that as part of the transform here if we want to catch all
+ // the opportunities.
+ // Currently the NumberOfIgnoredLowBits argument helps to recover
+ // form these situations when matching bigger pattern (bitfield insert).
- // Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+ // For unsigned extracts, check for a shift right and mask
+ uint64_t And_imm = 0;
+ if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
+ return false;
- SuperReg = SDValue(VLdDup, 0);
- unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
- // Update uses of each registers in super register
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
- ReplaceUses(SDValue(N, Vec),
- CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
- // Update uses of the Chain
- ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
- if (isUpdating)
- ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
- return NULL;
+ const SDNode *Op0 = N->getOperand(0).getNode();
+
+ // Because of simplify-demanded-bits in DAGCombine, the mask may have been
+ // simplified. Try to undo that
+ And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
+
+ // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+ if (And_imm & (And_imm + 1))
+ return false;
+
+ bool ClampMSB = false;
+ uint64_t Srl_imm = 0;
+ // Handle the SRL + ANY_EXTEND case.
+ if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
+ // Extend the incoming operand of the SRL to 64-bit.
+ Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
+ // Make sure to clamp the MSB so that we preserve the semantics of the
+ // original operations.
+ ClampMSB = true;
+ } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+ Srl_imm)) {
+ // If the shift result was truncated, we can still combine them.
+ Opd0 = Op0->getOperand(0).getOperand(0);
+
+ // Use the type of SRL node.
+ VT = Opd0->getValueType(0);
+ } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
+ Opd0 = Op0->getOperand(0);
+ } else if (BiggerPattern) {
+ // Let's pretend a 0 shift right has been performed.
+ // The resulting code will be at least as good as the original one
+ // plus it may expose more opportunities for bitfield insert pattern.
+ // FIXME: Currently we limit this to the bigger pattern, because
+ // some optimizations expect AND and not UBFM
+ Opd0 = N->getOperand(0);
+ } else
+ return false;
+
+ assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
+ "bad amount in shift node!");
+
+ LSB = Srl_imm;
+ MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
+ : CountTrailingOnes_64(And_imm)) -
+ 1;
+ if (ClampMSB)
+ // Since we're moving the extend before the right shift operation, we need
+ // to clamp the MSB to make sure we don't shift in undefined bits instead of
+ // the zeros which would get shifted in with the original right shift
+ // operation.
+ MSB = MSB > 31 ? 31 : MSB;
+
+ Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+ return true;
}
-// We only have 128-bit vector type of load/store lane instructions.
-// If it is 64-bit vector, we also select it to the 128-bit instructions.
-// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and
-// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output.
-SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
- bool isUpdating, unsigned NumVecs,
- const uint16_t *Opcodes) {
- assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
- SDLoc dl(N);
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
- unsigned Vec0Idx = 3;
+static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB) {
+ // We are looking for the following pattern which basically extracts a single
+ // bit from the source value and places it in the LSB of the destination
+ // value, all other bits of the destination value or set to zero:
+ //
+ // Value2 = AND Value, MaskImm
+ // SRL Value2, ShiftImm
+ //
+ // with MaskImm >> ShiftImm == 1.
+ //
+ // This gets selected into a single UBFM:
+ //
+ // UBFM Value, ShiftImm, ShiftImm
+ //
- SDValue Chain = N->getOperand(0);
- unsigned Lane =
- cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
- EVT VT = N->getOperand(Vec0Idx).getValueType();
- bool is64BitVector = VT.is64BitVector();
- EVT VT64; // 64-bit Vector Type
+ if (N->getOpcode() != ISD::SRL)
+ return false;
- if (is64BitVector) {
- VT64 = VT;
- VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
+ uint64_t And_mask = 0;
+ if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
+ return false;
+
+ Opd0 = N->getOperand(0).getOperand(0);
+
+ uint64_t Srl_imm = 0;
+ if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ return false;
+
+ // Check whether we really have a one bit extract here.
+ if (And_mask >> Srl_imm == 0x1) {
+ if (N->getValueType(0) == MVT::i32)
+ Opc = AArch64::UBFMWri;
+ else
+ Opc = AArch64::UBFMXri;
+
+ LSB = MSB = Srl_imm;
+
+ return true;
}
- unsigned OpcodeIndex;
- switch (VT.getScalarType().getSizeInBits()) {
- case 8: OpcodeIndex = 0; break;
- case 16: OpcodeIndex = 1; break;
- case 32: OpcodeIndex = 2; break;
- case 64: OpcodeIndex = 3; break;
- default: llvm_unreachable("unhandled vector lane load/store type");
- }
- unsigned Opc = Opcodes[OpcodeIndex];
-
- SmallVector<EVT, 3> ResTys;
- if (IsLoad) {
- // Push back the type of return super register
- if (NumVecs == 3)
- ResTys.push_back(MVT::Untyped);
- else {
- EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
- is64BitVector ? NumVecs : NumVecs * 2);
- ResTys.push_back(ResTy);
- }
- }
- if (isUpdating)
- ResTys.push_back(MVT::i64); // Type of the updated register
- ResTys.push_back(MVT::Other); // Type of Chain
- SmallVector<SDValue, 5> Ops;
- Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
- if (isUpdating) {
- SDValue Inc = N->getOperand(AddrOpIdx + 1);
- if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- Ops.push_back(Inc);
- }
-
- SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
- N->op_begin() + Vec0Idx + NumVecs);
- if (is64BitVector)
- for (unsigned i = 0; i < Regs.size(); i++)
- Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]);
- SDValue SuperReg = createQTuple(Regs);
-
- Ops.push_back(SuperReg); // Source Reg
- SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32);
- Ops.push_back(LaneValue);
- Ops.push_back(Chain); // Push back the Chain
-
- SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
- if (!IsLoad)
- return VLdLn;
-
- // Extract the subregisters.
- SuperReg = SDValue(VLdLn, 0);
- unsigned Sub0 = AArch64::qsub_0;
- // Update uses of each registers in super register
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
- SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg);
- if (is64BitVector) {
- SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0);
- }
- ReplaceUses(SDValue(N, Vec), SUB0);
- }
- ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
- if (isUpdating)
- ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
- return NULL;
+ return false;
}
-unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
- unsigned NumOfVec) {
- assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range");
+static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB,
+ bool BiggerPattern) {
+ assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+ "N must be a SHR/SRA operation to call this function");
- unsigned Opc = 0;
- switch (NumOfVec) {
+ EVT VT = N->getValueType(0);
+
+ // Here we can test the type of VT and return false when the type does not
+ // match, but since it is done prior to that call in the current context
+ // we turned that into an assert to avoid redundant code.
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ // Check for AND + SRL doing a one bit extract.
+ if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+ return true;
+
+ // we're looking for a shift of a shift
+ uint64_t Shl_imm = 0;
+ uint64_t Trunc_bits = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+ Opd0 = N->getOperand(0).getOperand(0);
+ } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+ // We are looking for a shift of truncate. Truncate from i64 to i32 could
+ // be considered as setting high 32 bits as zero. Our strategy here is to
+ // always generate 64bit UBFM. This consistency will help the CSE pass
+ // later find more redundancy.
+ Opd0 = N->getOperand(0).getOperand(0);
+ Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+ VT = Opd0->getValueType(0);
+ assert(VT == MVT::i64 && "the promoted type should be i64");
+ } else if (BiggerPattern) {
+ // Let's pretend a 0 shift left has been performed.
+ // FIXME: Currently we limit this to the bigger pattern case,
+ // because some optimizations expect AND and not UBFM
+ Opd0 = N->getOperand(0);
+ } else
+ return false;
+
+ assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
+ uint64_t Srl_imm = 0;
+ if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ return false;
+
+ assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
+ "bad amount in shift node!");
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
+ int sLSB = Srl_imm - Shl_imm;
+ if (sLSB < 0)
+ return false;
+ LSB = sLSB;
+ MSB = LSB + Width;
+ // SRA requires a signed extraction
+ if (VT == MVT::i32)
+ Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
+ else
+ Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
+ return true;
+}
+
+static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &LSB, unsigned &MSB,
+ unsigned NumberOfIgnoredLowBits = 0,
+ bool BiggerPattern = false) {
+ if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
+ return false;
+
+ switch (N->getOpcode()) {
default:
+ if (!N->isMachineOpcode())
+ return false;
break;
- case 1:
- if (IsExt)
- Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b;
- else
- Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b;
- break;
- case 2:
- if (IsExt)
- Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b;
- else
- Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b;
- break;
- case 3:
- if (IsExt)
- Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b;
- else
- Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b;
- break;
- case 4:
- if (IsExt)
- Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b;
- else
- Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b;
- break;
+ case ISD::AND:
+ return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
+ NumberOfIgnoredLowBits, BiggerPattern);
+ case ISD::SRL:
+ case ISD::SRA:
+ return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
}
- return Opc;
+ unsigned NOpc = N->getMachineOpcode();
+ switch (NOpc) {
+ default:
+ return false;
+ case AArch64::SBFMWri:
+ case AArch64::UBFMWri:
+ case AArch64::SBFMXri:
+ case AArch64::UBFMXri:
+ Opc = NOpc;
+ Opd0 = N->getOperand(0);
+ LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+ MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ return true;
+ }
+ // Unreachable
+ return false;
}
-SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs,
- bool IsExt) {
- assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
+ unsigned Opc, LSB, MSB;
+ SDValue Opd0;
+ if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
+ return nullptr;
+
+ EVT VT = N->getValueType(0);
+
+ // If the bit extract operation is 64bit but the original type is 32bit, we
+ // need to add one EXTRACT_SUBREG.
+ if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
+ SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
+ CurDAG->getTargetConstant(MSB, MVT::i64)};
+
+ SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+ MachineSDNode *Node =
+ CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
+ SDValue(BFM, 0), SubReg);
+ return Node;
+ }
+
+ SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
+ CurDAG->getTargetConstant(MSB, VT)};
+ return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
+/// Does DstMask form a complementary pair with the mask provided by
+/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
+/// this asks whether DstMask zeroes precisely those bits that will be set by
+/// the other half.
+static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
+ unsigned NumberOfIgnoredHighBits, EVT VT) {
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "i32 or i64 mask type expected!");
+ unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
+
+ APInt SignificantDstMask = APInt(BitWidth, DstMask);
+ APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
+
+ return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
+ (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
+}
+
+// Look for bits that will be useful for later uses.
+// A bit is consider useless as soon as it is dropped and never used
+// before it as been dropped.
+// E.g., looking for useful bit of x
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// After #1, x useful bits are 0x7, then the useful bits of x, live through
+// y.
+// After #2, the useful bits of x are 0x4.
+// However, if x is used on an unpredicatable instruction, then all its bits
+// are useful.
+// E.g.
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// 3. str x, [@x]
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
+
+static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+ Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+}
+
+static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
+ uint64_t Imm, uint64_t MSB,
+ unsigned Depth) {
+ // inherit the bitwidth value
+ APInt OpUsefulBits(UsefulBits);
+ OpUsefulBits = 1;
+
+ if (MSB >= Imm) {
+ OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ --OpUsefulBits;
+ // The interesting part will be in the lower part of the result
+ getUsefulBits(Op, OpUsefulBits, Depth + 1);
+ // The interesting part was starting at Imm in the argument
+ OpUsefulBits = OpUsefulBits.shl(Imm);
+ } else {
+ OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ --OpUsefulBits;
+ // The interesting part will be shifted in the result
+ OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+ getUsefulBits(Op, OpUsefulBits, Depth + 1);
+ // The interesting part was at zero in the argument
+ OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+ }
+
+ UsefulBits &= OpUsefulBits;
+}
+
+static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+ uint64_t MSB =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+
+ getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+}
+
+static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t ShiftTypeAndValue =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+ APInt Mask(UsefulBits);
+ Mask.clearAllBits();
+ Mask.flipAllBits();
+
+ if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
+ // Shift Left
+ uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+ Mask = Mask.shl(ShiftAmt);
+ getUsefulBits(Op, Mask, Depth + 1);
+ Mask = Mask.lshr(ShiftAmt);
+ } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
+ // Shift Right
+ // We do not handle AArch64_AM::ASR, because the sign will change the
+ // number of useful bits
+ uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+ Mask = Mask.lshr(ShiftAmt);
+ getUsefulBits(Op, Mask, Depth + 1);
+ Mask = Mask.shl(ShiftAmt);
+ } else
+ return;
+
+ UsefulBits &= Mask;
+}
+
+static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+ uint64_t MSB =
+ cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
+
+ if (Op.getOperand(1) == Orig)
+ return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+
+ APInt OpUsefulBits(UsefulBits);
+ OpUsefulBits = 1;
+
+ if (MSB >= Imm) {
+ OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ --OpUsefulBits;
+ UsefulBits &= ~OpUsefulBits;
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+ } else {
+ OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ --OpUsefulBits;
+ UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm));
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+ }
+}
+
+static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
+ SDValue Orig, unsigned Depth) {
+
+ // Users of this node should have already been instruction selected
+ // FIXME: Can we turn that into an assert?
+ if (!UserNode->isMachineOpcode())
+ return;
+
+ switch (UserNode->getMachineOpcode()) {
+ default:
+ return;
+ case AArch64::ANDSWri:
+ case AArch64::ANDSXri:
+ case AArch64::ANDWri:
+ case AArch64::ANDXri:
+ // We increment Depth only when we call the getUsefulBits
+ return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
+ Depth);
+ case AArch64::UBFMWri:
+ case AArch64::UBFMXri:
+ return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
+
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ if (UserNode->getOperand(1) != Orig)
+ return;
+ return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
+ Depth);
+ case AArch64::BFMWri:
+ case AArch64::BFMXri:
+ return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+ }
+}
+
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
+ if (Depth >= 6)
+ return;
+ // Initialize UsefulBits
+ if (!Depth) {
+ unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits();
+ // At the beginning, assume every produced bits is useful
+ UsefulBits = APInt(Bitwidth, 0);
+ UsefulBits.flipAllBits();
+ }
+ APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
+
+ for (SDNode *Node : Op.getNode()->uses()) {
+ // A use cannot produce useful bits
+ APInt UsefulBitsForUse = APInt(UsefulBits);
+ getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
+ UsersUsefulBits |= UsefulBitsForUse;
+ }
+ // UsefulBits contains the produced bits that are meaningful for the
+ // current definition, thus a user cannot make a bit meaningful at
+ // this point
+ UsefulBits &= UsersUsefulBits;
+}
+
+/// Create a machine node performing a notional SHL of Op by ShlAmount. If
+/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
+/// 0, return Op unchanged.
+static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
+ if (ShlAmount == 0)
+ return Op;
+
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+
+ SDNode *ShiftNode;
+ if (ShlAmount > 0) {
+ // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
+ ShiftNode = CurDAG->getMachineNode(
+ UBFMOpc, SDLoc(Op), VT, Op,
+ CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
+ CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
+ } else {
+ // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
+ assert(ShlAmount < 0 && "expected right shift");
+ int ShrAmount = -ShlAmount;
+ ShiftNode = CurDAG->getMachineNode(
+ UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
+ CurDAG->getTargetConstant(BitWidth - 1, VT));
+ }
+
+ return SDValue(ShiftNode, 0);
+}
+
+/// Does this tree qualify as an attempt to move a bitfield into position,
+/// essentially "(and (shl VAL, N), Mask)".
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+ SDValue &Src, int &ShiftAmount,
+ int &MaskWidth) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ (void)BitWidth;
+ assert(BitWidth == 32 || BitWidth == 64);
+
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+
+ // Non-zero in the sense that they're not provably zero, which is the key
+ // point if we want to use this value
+ uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+
+ // Discard a constant AND mask if present. It's safe because the node will
+ // already have been factored into the computeKnownBits calculation above.
+ uint64_t AndImm;
+ if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
+ assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+ Op = Op.getOperand(0);
+ }
+
+ uint64_t ShlImm;
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
+ return false;
+ Op = Op.getOperand(0);
+
+ if (!isShiftedMask_64(NonZeroBits))
+ return false;
+
+ ShiftAmount = countTrailingZeros(NonZeroBits);
+ MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
+
+ // BFI encompasses sufficiently many nodes that it's worth inserting an extra
+ // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
+ // amount.
+ Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
+
+ return true;
+}
+
+// Given a OR operation, check if we have the following pattern
+// ubfm c, b, imm, imm2 (or something that does the same jobs, see
+// isBitfieldExtractOp)
+// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
+// countTrailingZeros(mask2) == imm2 - imm + 1
+// f = d | c
+// if yes, given reference arguments will be update so that one can replace
+// the OR instruction with:
+// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
+static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
+ SDValue &Src, unsigned &ImmR,
+ unsigned &ImmS, SelectionDAG *CurDAG) {
+ assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+ // Set Opc
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i32)
+ Opc = AArch64::BFMWri;
+ else if (VT == MVT::i64)
+ Opc = AArch64::BFMXri;
+ else
+ return false;
+
+ // Because of simplify-demanded-bits in DAGCombine, involved masks may not
+ // have the expected shape. Try to undo that.
+ APInt UsefulBits;
+ getUsefulBits(SDValue(N, 0), UsefulBits);
+
+ unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
+ unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+
+ // OR is commutative, check both possibilities (does llvm provide a
+ // way to do that directely, e.g., via code matcher?)
+ SDValue OrOpd1Val = N->getOperand(1);
+ SDNode *OrOpd0 = N->getOperand(0).getNode();
+ SDNode *OrOpd1 = N->getOperand(1).getNode();
+ for (int i = 0; i < 2;
+ ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+ unsigned BFXOpc;
+ int DstLSB, Width;
+ if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
+ NumberOfIgnoredLowBits, true)) {
+ // Check that the returned opcode is compatible with the pattern,
+ // i.e., same type and zero extended (U and not S)
+ if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
+ (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
+ continue;
+
+ // Compute the width of the bitfield insertion
+ DstLSB = 0;
+ Width = ImmS - ImmR + 1;
+ // FIXME: This constraint is to catch bitfield insertion we may
+ // want to widen the pattern if we want to grab general bitfied
+ // move case
+ if (Width <= 0)
+ continue;
+
+ // If the mask on the insertee is correct, we have a BFXIL operation. We
+ // can share the ImmR and ImmS values from the already-computed UBFM.
+ } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
+ DstLSB, Width)) {
+ ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+ ImmS = Width - 1;
+ } else
+ continue;
+
+ // Check the second part of the pattern
+ EVT VT = OrOpd1->getValueType(0);
+ assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
+
+ // Compute the Known Zero for the candidate of the first operand.
+ // This allows to catch more general case than just looking for
+ // AND with imm. Indeed, simplify-demanded-bits may have removed
+ // the AND instruction because it proves it was useless.
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+
+ // Check if there is enough room for the second operand to appear
+ // in the first one
+ APInt BitsToBeInserted =
+ APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+
+ if ((BitsToBeInserted & ~KnownZero) != 0)
+ continue;
+
+ // Set the first operand
+ uint64_t Imm;
+ if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
+ isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
+ // In that case, we can eliminate the AND
+ Dst = OrOpd1->getOperand(0);
+ else
+ // Maybe the AND has been removed by simplify-demanded-bits
+ // or is useful because it discards more bits
+ Dst = OrOpd1Val;
+
+ // both parts match
+ return true;
+ }
+
+ return false;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
+ if (N->getOpcode() != ISD::OR)
+ return nullptr;
+
+ unsigned Opc;
+ unsigned LSB, MSB;
+ SDValue Opd0, Opd1;
+
+ if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+ return nullptr;
+
+ EVT VT = N->getValueType(0);
+ SDValue Ops[] = { Opd0,
+ Opd1,
+ CurDAG->getTargetConstant(LSB, VT),
+ CurDAG->getTargetConstant(MSB, VT) };
+ return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ unsigned Variant;
+ unsigned Opc;
+ unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr };
+
+ if (VT == MVT::f32) {
+ Variant = 0;
+ } else if (VT == MVT::f64) {
+ Variant = 1;
+ } else
+ return nullptr; // Unrecognized argument type. Fall back on default codegen.
+
+ // Pick the FRINTX variant needed to set the flags.
+ unsigned FRINTXOpc = FRINTXOpcs[Variant];
+
+ switch (N->getOpcode()) {
+ default:
+ return nullptr; // Unrecognized libm ISD node. Fall back on default codegen.
+ case ISD::FCEIL: {
+ unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr };
+ Opc = FRINTPOpcs[Variant];
+ break;
+ }
+ case ISD::FFLOOR: {
+ unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr };
+ Opc = FRINTMOpcs[Variant];
+ break;
+ }
+ case ISD::FTRUNC: {
+ unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr };
+ Opc = FRINTZOpcs[Variant];
+ break;
+ }
+ case ISD::FROUND: {
+ unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr };
+ Opc = FRINTAOpcs[Variant];
+ break;
+ }
+ }
+
SDLoc dl(N);
+ SDValue In = N->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ Ops.push_back(In);
- // Check the element of look up table is 64-bit or not
- unsigned Vec0Idx = IsExt ? 2 : 1;
- assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() &&
- "The element of lookup table for vtbl and vtbx must be 128-bit");
+ if (!TM.Options.UnsafeFPMath) {
+ SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
+ Ops.push_back(SDValue(FRINTX, 1));
+ }
- // Check the return value type is 64-bit or not
- EVT ResVT = N->getValueType(0);
- bool is64BitRes = ResVT.is64BitVector();
+ return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+}
- // Create new SDValue for vector list
- SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
- N->op_begin() + Vec0Idx + NumVecs);
- SDValue TblReg = createQTuple(Regs);
- unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs);
+bool
+AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+ unsigned RegWidth) {
+ APFloat FVal(0.0);
+ if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+ FVal = CN->getValueAPF();
+ else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
+ // Some otherwise illegal constants are allowed in this case.
+ if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
+ !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
+ return false;
- SmallVector<SDValue, 3> Ops;
- if (IsExt)
- Ops.push_back(N->getOperand(1));
- Ops.push_back(TblReg);
- Ops.push_back(N->getOperand(Vec0Idx + NumVecs));
- return CurDAG->getMachineNode(Opc, dl, ResVT, Ops);
+ ConstantPoolSDNode *CN =
+ dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+ FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
+ } else
+ return false;
+
+ // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+ // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+ // x-register.
+ //
+ // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+ // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+ // integers.
+ bool IsExact;
+
+ // fbits is between 1 and 64 in the worst-case, which means the fmul
+ // could have 2^64 as an actual operand. Need 65 bits of precision.
+ APSInt IntVal(65, true);
+ FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+ // N.b. isPowerOf2 also checks for > 0.
+ if (!IsExact || !IntVal.isPowerOf2()) return false;
+ unsigned FBits = IntVal.logBase2();
+
+ // Checks above should have guaranteed that we haven't lost information in
+ // finding FBits, but it must still be in range.
+ if (FBits == 0 || FBits > RegWidth) return false;
+
+ FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32);
+ return true;
}
SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
+ DEBUG(errs() << "Selecting: ");
+ DEBUG(Node->dump(CurDAG));
+ DEBUG(errs() << "\n");
+ // If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return NULL;
+ return nullptr;
}
+ // Few custom selection stuff.
+ SDNode *ResNode = nullptr;
+ EVT VT = Node->getValueType(0);
+
switch (Node->getOpcode()) {
- case ISD::ATOMIC_LOAD_ADD:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_ADD_I8,
- AArch64::ATOMIC_LOAD_ADD_I16,
- AArch64::ATOMIC_LOAD_ADD_I32,
- AArch64::ATOMIC_LOAD_ADD_I64);
- case ISD::ATOMIC_LOAD_SUB:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_SUB_I8,
- AArch64::ATOMIC_LOAD_SUB_I16,
- AArch64::ATOMIC_LOAD_SUB_I32,
- AArch64::ATOMIC_LOAD_SUB_I64);
- case ISD::ATOMIC_LOAD_AND:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_AND_I8,
- AArch64::ATOMIC_LOAD_AND_I16,
- AArch64::ATOMIC_LOAD_AND_I32,
- AArch64::ATOMIC_LOAD_AND_I64);
- case ISD::ATOMIC_LOAD_OR:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_OR_I8,
- AArch64::ATOMIC_LOAD_OR_I16,
- AArch64::ATOMIC_LOAD_OR_I32,
- AArch64::ATOMIC_LOAD_OR_I64);
- case ISD::ATOMIC_LOAD_XOR:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_XOR_I8,
- AArch64::ATOMIC_LOAD_XOR_I16,
- AArch64::ATOMIC_LOAD_XOR_I32,
- AArch64::ATOMIC_LOAD_XOR_I64);
- case ISD::ATOMIC_LOAD_NAND:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_NAND_I8,
- AArch64::ATOMIC_LOAD_NAND_I16,
- AArch64::ATOMIC_LOAD_NAND_I32,
- AArch64::ATOMIC_LOAD_NAND_I64);
- case ISD::ATOMIC_LOAD_MIN:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_MIN_I8,
- AArch64::ATOMIC_LOAD_MIN_I16,
- AArch64::ATOMIC_LOAD_MIN_I32,
- AArch64::ATOMIC_LOAD_MIN_I64);
- case ISD::ATOMIC_LOAD_MAX:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_MAX_I8,
- AArch64::ATOMIC_LOAD_MAX_I16,
- AArch64::ATOMIC_LOAD_MAX_I32,
- AArch64::ATOMIC_LOAD_MAX_I64);
- case ISD::ATOMIC_LOAD_UMIN:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_UMIN_I8,
- AArch64::ATOMIC_LOAD_UMIN_I16,
- AArch64::ATOMIC_LOAD_UMIN_I32,
- AArch64::ATOMIC_LOAD_UMIN_I64);
- case ISD::ATOMIC_LOAD_UMAX:
- return SelectAtomic(Node,
- AArch64::ATOMIC_LOAD_UMAX_I8,
- AArch64::ATOMIC_LOAD_UMAX_I16,
- AArch64::ATOMIC_LOAD_UMAX_I32,
- AArch64::ATOMIC_LOAD_UMAX_I64);
- case ISD::ATOMIC_SWAP:
- return SelectAtomic(Node,
- AArch64::ATOMIC_SWAP_I8,
- AArch64::ATOMIC_SWAP_I16,
- AArch64::ATOMIC_SWAP_I32,
- AArch64::ATOMIC_SWAP_I64);
- case ISD::ATOMIC_CMP_SWAP:
- return SelectAtomic(Node,
- AArch64::ATOMIC_CMP_SWAP_I8,
- AArch64::ATOMIC_CMP_SWAP_I16,
- AArch64::ATOMIC_CMP_SWAP_I32,
- AArch64::ATOMIC_CMP_SWAP_I64);
- case ISD::FrameIndex: {
- int FI = cast<FrameIndexSDNode>(Node)->getIndex();
- EVT PtrTy = getTargetLowering()->getPointerTy();
- SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy);
- return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy,
- TFI, CurDAG->getTargetConstant(0, PtrTy));
+ default:
+ break;
+
+ case ISD::ADD:
+ if (SDNode *I = SelectMLAV64LaneV128(Node))
+ return I;
+ break;
+
+ case ISD::LOAD: {
+ // Try to select as an indexed load. Fall through to normal processing
+ // if we can't.
+ bool Done = false;
+ SDNode *I = SelectIndexedLoad(Node, Done);
+ if (Done)
+ return I;
+ break;
+ }
+
+ case ISD::SRL:
+ case ISD::AND:
+ case ISD::SRA:
+ if (SDNode *I = SelectBitfieldExtractOp(Node))
+ return I;
+ break;
+
+ case ISD::OR:
+ if (SDNode *I = SelectBitfieldInsertOp(Node))
+ return I;
+ break;
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Extracting lane zero is a special case where we can just use a plain
+ // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+ // the rest of the compiler, especially the register allocator and copyi
+ // propagation, to reason about, so is preferred when it's possible to
+ // use it.
+ ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+ // Bail and use the default Select() for non-zero lanes.
+ if (LaneNode->getZExtValue() != 0)
+ break;
+ // If the element type is not the same as the result type, likewise
+ // bail and use the default Select(), as there's more to do than just
+ // a cross-class COPY. This catches extracts of i8 and i16 elements
+ // since they will need an explicit zext.
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType())
+ break;
+ unsigned SubReg;
+ switch (Node->getOperand(0)
+ .getValueType()
+ .getVectorElementType()
+ .getSizeInBits()) {
+ default:
+ assert(0 && "Unexpected vector element type!");
+ case 64:
+ SubReg = AArch64::dsub;
+ break;
+ case 32:
+ SubReg = AArch64::ssub;
+ break;
+ case 16: // FALLTHROUGH
+ case 8:
+ llvm_unreachable("unexpected zext-requiring extract element!");
+ }
+ SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
+ Node->getOperand(0));
+ DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+ DEBUG(Extract->dumpr(CurDAG));
+ DEBUG(dbgs() << "\n");
+ return Extract.getNode();
}
case ISD::Constant: {
- SDNode *ResNode = 0;
- if (cast<ConstantSDNode>(Node)->getZExtValue() == 0) {
- // XZR and WZR are probably even better than an actual move: most of the
- // time they can be folded into another instruction with *no* cost.
-
- EVT Ty = Node->getValueType(0);
- assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type");
- uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR;
- ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
- SDLoc(Node),
- Register, Ty).getNode();
- }
-
- // Next best option is a move-immediate, see if we can do that.
- if (!ResNode) {
- ResNode = TrySelectToMoveImm(Node);
- }
-
- if (ResNode)
- return ResNode;
-
- // If even that fails we fall back to a lit-pool entry at the moment. Future
- // tuning may change this to a sequence of MOVZ/MOVN/MOVK instructions.
- ResNode = SelectToLitPool(Node);
- assert(ResNode && "We need *some* way to materialise a constant");
-
- // We want to continue selection at this point since the litpool access
- // generated used generic nodes for simplicity.
- ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
- Node = ResNode;
- break;
- }
- case ISD::ConstantFP: {
- if (A64Imms::isFPImm(cast<ConstantFPSDNode>(Node)->getValueAPF())) {
- // FMOV will take care of it from TableGen
- break;
- }
-
- SDNode *ResNode = LowerToFPLitPool(Node);
- ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
-
- // We want to continue selection at this point since the litpool access
- // generated used generic nodes for simplicity.
- Node = ResNode;
- break;
- }
- case AArch64ISD::NEON_LD1_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1WB_8B_fixed, AArch64::LD1WB_4H_fixed,
- AArch64::LD1WB_2S_fixed, AArch64::LD1WB_1D_fixed,
- AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed,
- AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed
- };
- return SelectVLD(Node, true, 1, Opcodes);
- }
- case AArch64ISD::NEON_LD2_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD2WB_8B_fixed, AArch64::LD2WB_4H_fixed,
- AArch64::LD2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed,
- AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed,
- AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed
- };
- return SelectVLD(Node, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_LD3_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD3WB_8B_fixed, AArch64::LD3WB_4H_fixed,
- AArch64::LD3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed,
- AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed,
- AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed
- };
- return SelectVLD(Node, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_LD4_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD4WB_8B_fixed, AArch64::LD4WB_4H_fixed,
- AArch64::LD4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed,
- AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed,
- AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed
- };
- return SelectVLD(Node, true, 4, Opcodes);
- }
- case AArch64ISD::NEON_LD1x2_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1x2WB_8B_fixed, AArch64::LD1x2WB_4H_fixed,
- AArch64::LD1x2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed,
- AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed,
- AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed
- };
- return SelectVLD(Node, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_LD1x3_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1x3WB_8B_fixed, AArch64::LD1x3WB_4H_fixed,
- AArch64::LD1x3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed,
- AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed,
- AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed
- };
- return SelectVLD(Node, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_LD1x4_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1x4WB_8B_fixed, AArch64::LD1x4WB_4H_fixed,
- AArch64::LD1x4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed,
- AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed,
- AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed
- };
- return SelectVLD(Node, true, 4, Opcodes);
- }
- case AArch64ISD::NEON_ST1_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1WB_8B_fixed, AArch64::ST1WB_4H_fixed,
- AArch64::ST1WB_2S_fixed, AArch64::ST1WB_1D_fixed,
- AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed,
- AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed
- };
- return SelectVST(Node, true, 1, Opcodes);
- }
- case AArch64ISD::NEON_ST2_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST2WB_8B_fixed, AArch64::ST2WB_4H_fixed,
- AArch64::ST2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed,
- AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed,
- AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed
- };
- return SelectVST(Node, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_ST3_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST3WB_8B_fixed, AArch64::ST3WB_4H_fixed,
- AArch64::ST3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed,
- AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed,
- AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed
- };
- return SelectVST(Node, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_ST4_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST4WB_8B_fixed, AArch64::ST4WB_4H_fixed,
- AArch64::ST4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed,
- AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed,
- AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed
- };
- return SelectVST(Node, true, 4, Opcodes);
- }
- case AArch64ISD::NEON_LD2DUP: {
- static const uint16_t Opcodes[] = {
- AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S,
- AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H,
- AArch64::LD2R_4S, AArch64::LD2R_2D
- };
- return SelectVLDDup(Node, false, 2, Opcodes);
- }
- case AArch64ISD::NEON_LD3DUP: {
- static const uint16_t Opcodes[] = {
- AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S,
- AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H,
- AArch64::LD3R_4S, AArch64::LD3R_2D
- };
- return SelectVLDDup(Node, false, 3, Opcodes);
- }
- case AArch64ISD::NEON_LD4DUP: {
- static const uint16_t Opcodes[] = {
- AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S,
- AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H,
- AArch64::LD4R_4S, AArch64::LD4R_2D
- };
- return SelectVLDDup(Node, false, 4, Opcodes);
- }
- case AArch64ISD::NEON_LD2DUP_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed,
- AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed,
- AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed,
- AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed
- };
- return SelectVLDDup(Node, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_LD3DUP_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed,
- AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed,
- AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed,
- AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed
- };
- return SelectVLDDup(Node, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_LD4DUP_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed,
- AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed,
- AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed,
- AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed
- };
- return SelectVLDDup(Node, true, 4, Opcodes);
- }
- case AArch64ISD::NEON_LD2LN_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed,
- AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed
- };
- return SelectVLDSTLane(Node, true, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_LD3LN_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed,
- AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed
- };
- return SelectVLDSTLane(Node, true, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_LD4LN_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed,
- AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed
- };
- return SelectVLDSTLane(Node, true, true, 4, Opcodes);
- }
- case AArch64ISD::NEON_ST2LN_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed,
- AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed
- };
- return SelectVLDSTLane(Node, false, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_ST3LN_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed,
- AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed
- };
- return SelectVLDSTLane(Node, false, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_ST4LN_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed,
- AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed
- };
- return SelectVLDSTLane(Node, false, true, 4, Opcodes);
- }
- case AArch64ISD::NEON_ST1x2_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1x2WB_8B_fixed, AArch64::ST1x2WB_4H_fixed,
- AArch64::ST1x2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed,
- AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed,
- AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed
- };
- return SelectVST(Node, true, 2, Opcodes);
- }
- case AArch64ISD::NEON_ST1x3_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1x3WB_8B_fixed, AArch64::ST1x3WB_4H_fixed,
- AArch64::ST1x3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed,
- AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed,
- AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed
- };
- return SelectVST(Node, true, 3, Opcodes);
- }
- case AArch64ISD::NEON_ST1x4_UPD: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1x4WB_8B_fixed, AArch64::ST1x4WB_4H_fixed,
- AArch64::ST1x4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed,
- AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed,
- AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed
- };
- return SelectVST(Node, true, 4, Opcodes);
- }
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
- bool IsExt = false;
- switch (IntNo) {
- default:
- break;
- case Intrinsic::aarch64_neon_vtbx1:
- IsExt = true;
- case Intrinsic::aarch64_neon_vtbl1:
- return SelectVTBL(Node, 1, IsExt);
- case Intrinsic::aarch64_neon_vtbx2:
- IsExt = true;
- case Intrinsic::aarch64_neon_vtbl2:
- return SelectVTBL(Node, 2, IsExt);
- case Intrinsic::aarch64_neon_vtbx3:
- IsExt = true;
- case Intrinsic::aarch64_neon_vtbl3:
- return SelectVTBL(Node, 3, IsExt);
- case Intrinsic::aarch64_neon_vtbx4:
- IsExt = true;
- case Intrinsic::aarch64_neon_vtbl4:
- return SelectVTBL(Node, 4, IsExt);
+ // Materialize zero constants as copies from WZR/XZR. This allows
+ // the coalescer to propagate these into other instructions.
+ ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+ if (ConstNode->isNullValue()) {
+ if (VT == MVT::i32)
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+ AArch64::WZR, MVT::i32).getNode();
+ else if (VT == MVT::i64)
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+ AArch64::XZR, MVT::i64).getNode();
}
break;
}
- case ISD::INTRINSIC_VOID:
+
+ case ISD::FrameIndex: {
+ // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+ const TargetLowering *TLI = getTargetLowering();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+ CurDAG->getTargetConstant(Shifter, MVT::i32) };
+ return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+ }
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
default:
break;
- case Intrinsic::arm_neon_vld1: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D,
- AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D
- };
- return SelectVLD(Node, false, 1, Opcodes);
+ case Intrinsic::aarch64_ldaxp:
+ case Intrinsic::aarch64_ldxp: {
+ unsigned Op =
+ IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
+ SDValue MemAddr = Node->getOperand(2);
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+
+ SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
+ MVT::Other, MemAddr, Chain);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+ return Ld;
}
- case Intrinsic::arm_neon_vld2: {
- static const uint16_t Opcodes[] = {
- AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D,
- AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D
- };
- return SelectVLD(Node, false, 2, Opcodes);
+ case Intrinsic::aarch64_stlxp:
+ case Intrinsic::aarch64_stxp: {
+ unsigned Op =
+ IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+ SDValue ValLo = Node->getOperand(2);
+ SDValue ValHi = Node->getOperand(3);
+ SDValue MemAddr = Node->getOperand(4);
+
+ // Place arguments in the right order.
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(ValLo);
+ Ops.push_back(ValHi);
+ Ops.push_back(MemAddr);
+ Ops.push_back(Chain);
+
+ SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ return St;
}
- case Intrinsic::arm_neon_vld3: {
- static const uint16_t Opcodes[] = {
- AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D,
- AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D
- };
- return SelectVLD(Node, false, 3, Opcodes);
+ case Intrinsic::aarch64_neon_ld1x2:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld1x3:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld1x4:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld2:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld3:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld4:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld2r:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld3r:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld4r:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+ break;
+ case Intrinsic::aarch64_neon_ld2lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectLoadLane(Node, 2, AArch64::LD2i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectLoadLane(Node, 2, AArch64::LD2i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectLoadLane(Node, 2, AArch64::LD2i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectLoadLane(Node, 2, AArch64::LD2i64);
+ break;
+ case Intrinsic::aarch64_neon_ld3lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectLoadLane(Node, 3, AArch64::LD3i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectLoadLane(Node, 3, AArch64::LD3i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectLoadLane(Node, 3, AArch64::LD3i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectLoadLane(Node, 3, AArch64::LD3i64);
+ break;
+ case Intrinsic::aarch64_neon_ld4lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectLoadLane(Node, 4, AArch64::LD4i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectLoadLane(Node, 4, AArch64::LD4i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectLoadLane(Node, 4, AArch64::LD4i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectLoadLane(Node, 4, AArch64::LD4i64);
+ break;
}
- case Intrinsic::arm_neon_vld4: {
- static const uint16_t Opcodes[] = {
- AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D,
- AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D
- };
- return SelectVLD(Node, false, 4, Opcodes);
+ } break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_tbl2:
+ return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two
+ : AArch64::TBLv16i8Two,
+ false);
+ case Intrinsic::aarch64_neon_tbl3:
+ return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+ : AArch64::TBLv16i8Three,
+ false);
+ case Intrinsic::aarch64_neon_tbl4:
+ return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+ : AArch64::TBLv16i8Four,
+ false);
+ case Intrinsic::aarch64_neon_tbx2:
+ return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two
+ : AArch64::TBXv16i8Two,
+ true);
+ case Intrinsic::aarch64_neon_tbx3:
+ return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+ : AArch64::TBXv16i8Three,
+ true);
+ case Intrinsic::aarch64_neon_tbx4:
+ return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+ : AArch64::TBXv16i8Four,
+ true);
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull:
+ if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
+ return N;
+ break;
}
- case Intrinsic::aarch64_neon_vld1x2: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S,
- AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
- AArch64::LD1x2_4S, AArch64::LD1x2_2D
- };
- return SelectVLD(Node, false, 2, Opcodes);
- }
- case Intrinsic::aarch64_neon_vld1x3: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S,
- AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
- AArch64::LD1x3_4S, AArch64::LD1x3_2D
- };
- return SelectVLD(Node, false, 3, Opcodes);
- }
- case Intrinsic::aarch64_neon_vld1x4: {
- static const uint16_t Opcodes[] = {
- AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S,
- AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
- AArch64::LD1x4_4S, AArch64::LD1x4_2D
- };
- return SelectVLD(Node, false, 4, Opcodes);
- }
- case Intrinsic::arm_neon_vst1: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D,
- AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D
- };
- return SelectVST(Node, false, 1, Opcodes);
- }
- case Intrinsic::arm_neon_vst2: {
- static const uint16_t Opcodes[] = {
- AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D,
- AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D
- };
- return SelectVST(Node, false, 2, Opcodes);
- }
- case Intrinsic::arm_neon_vst3: {
- static const uint16_t Opcodes[] = {
- AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
- AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
- };
- return SelectVST(Node, false, 3, Opcodes);
- }
- case Intrinsic::arm_neon_vst4: {
- static const uint16_t Opcodes[] = {
- AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
- AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
- };
- return SelectVST(Node, false, 4, Opcodes);
- }
- case Intrinsic::aarch64_neon_vst1x2: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S,
- AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
- AArch64::ST1x2_4S, AArch64::ST1x2_2D
- };
- return SelectVST(Node, false, 2, Opcodes);
- }
- case Intrinsic::aarch64_neon_vst1x3: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S,
- AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
- AArch64::ST1x3_4S, AArch64::ST1x3_2D
- };
- return SelectVST(Node, false, 3, Opcodes);
- }
- case Intrinsic::aarch64_neon_vst1x4: {
- static const uint16_t Opcodes[] = {
- AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S,
- AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
- AArch64::ST1x4_4S, AArch64::ST1x4_2D
- };
- return SelectVST(Node, false, 4, Opcodes);
- }
- case Intrinsic::arm_neon_vld2lane: {
- static const uint16_t Opcodes[] = {
- AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
- };
- return SelectVLDSTLane(Node, true, false, 2, Opcodes);
- }
- case Intrinsic::arm_neon_vld3lane: {
- static const uint16_t Opcodes[] = {
- AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
- };
- return SelectVLDSTLane(Node, true, false, 3, Opcodes);
- }
- case Intrinsic::arm_neon_vld4lane: {
- static const uint16_t Opcodes[] = {
- AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
- };
- return SelectVLDSTLane(Node, true, false, 4, Opcodes);
- }
- case Intrinsic::arm_neon_vst2lane: {
- static const uint16_t Opcodes[] = {
- AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
- };
- return SelectVLDSTLane(Node, false, false, 2, Opcodes);
- }
- case Intrinsic::arm_neon_vst3lane: {
- static const uint16_t Opcodes[] = {
- AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
- };
- return SelectVLDSTLane(Node, false, false, 3, Opcodes);
- }
- case Intrinsic::arm_neon_vst4lane: {
- static const uint16_t Opcodes[] = {
- AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
- };
- return SelectVLDSTLane(Node, false, false, 4, Opcodes);
- }
- } // End of switch IntNo
break;
- } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
- default:
- break; // Let generic code handle it
+ }
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ if (Node->getNumOperands() >= 3)
+ VT = Node->getOperand(2)->getValueType(0);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_st1x2: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 2, AArch64::ST1Twov8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 2, AArch64::ST1Twov16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 2, AArch64::ST1Twov4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 2, AArch64::ST1Twov8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 2, AArch64::ST1Twov2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 2, AArch64::ST1Twov4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 2, AArch64::ST1Twov2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 2, AArch64::ST1Twov1d);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st1x3: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 3, AArch64::ST1Threev8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 3, AArch64::ST1Threev16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 3, AArch64::ST1Threev4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 3, AArch64::ST1Threev8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 3, AArch64::ST1Threev2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 3, AArch64::ST1Threev4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 3, AArch64::ST1Threev2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 3, AArch64::ST1Threev1d);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st1x4: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 4, AArch64::ST1Fourv8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 4, AArch64::ST1Fourv16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 4, AArch64::ST1Fourv4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 4, AArch64::ST1Fourv8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 4, AArch64::ST1Fourv2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 4, AArch64::ST1Fourv4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 4, AArch64::ST1Fourv2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st2: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 2, AArch64::ST2Twov8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 2, AArch64::ST2Twov16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 2, AArch64::ST2Twov4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 2, AArch64::ST2Twov8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 2, AArch64::ST2Twov2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 2, AArch64::ST2Twov4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 2, AArch64::ST2Twov2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 2, AArch64::ST1Twov1d);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st3: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 3, AArch64::ST3Threev8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 3, AArch64::ST3Threev16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 3, AArch64::ST3Threev4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 3, AArch64::ST3Threev8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 3, AArch64::ST3Threev2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 3, AArch64::ST3Threev4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 3, AArch64::ST3Threev2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 3, AArch64::ST1Threev1d);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st4: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 4, AArch64::ST4Fourv8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 4, AArch64::ST4Fourv16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 4, AArch64::ST4Fourv4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 4, AArch64::ST4Fourv8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 4, AArch64::ST4Fourv2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 4, AArch64::ST4Fourv4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 4, AArch64::ST4Fourv2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st2lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectStoreLane(Node, 2, AArch64::ST2i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectStoreLane(Node, 2, AArch64::ST2i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectStoreLane(Node, 2, AArch64::ST2i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectStoreLane(Node, 2, AArch64::ST2i64);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st3lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectStoreLane(Node, 3, AArch64::ST3i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectStoreLane(Node, 3, AArch64::ST3i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectStoreLane(Node, 3, AArch64::ST3i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectStoreLane(Node, 3, AArch64::ST3i64);
+ break;
+ }
+ case Intrinsic::aarch64_neon_st4lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectStoreLane(Node, 4, AArch64::ST4i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectStoreLane(Node, 4, AArch64::ST4i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectStoreLane(Node, 4, AArch64::ST4i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectStoreLane(Node, 4, AArch64::ST4i64);
+ break;
+ }
+ }
+ }
+ case AArch64ISD::LD2post: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD3post: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD4post: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD1x2post: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD1x3post: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD1x4post: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD1DUPpost: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD2DUPpost: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD3DUPpost: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD4DUPpost: {
+ if (VT == MVT::v8i8)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+ break;
+ }
+ case AArch64ISD::LD1LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+ break;
+ }
+ case AArch64ISD::LD2LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+ break;
+ }
+ case AArch64ISD::LD3LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+ break;
+ }
+ case AArch64ISD::LD4LANEpost: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+ break;
+ }
+ case AArch64ISD::ST2post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+ else if (VT == MVT::v16i8)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+ else if (VT == MVT::v4i16)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+ else if (VT == MVT::v8i16)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ break;
+ }
+ case AArch64ISD::ST3post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+ else if (VT == MVT::v16i8)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+ else if (VT == MVT::v4i16)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+ else if (VT == MVT::v8i16)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ break;
+ }
+ case AArch64ISD::ST4post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+ else if (VT == MVT::v16i8)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+ else if (VT == MVT::v4i16)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+ else if (VT == MVT::v8i16)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ break;
+ }
+ case AArch64ISD::ST1x2post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+ else if (VT == MVT::v16i8)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+ else if (VT == MVT::v4i16)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+ else if (VT == MVT::v8i16)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+ break;
+ }
+ case AArch64ISD::ST1x3post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+ else if (VT == MVT::v16i8)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+ else if (VT == MVT::v4i16)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+ else if (VT == MVT::v8i16)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+ break;
+ }
+ case AArch64ISD::ST1x4post: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v8i8)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+ else if (VT == MVT::v16i8)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+ else if (VT == MVT::v4i16)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+ else if (VT == MVT::v8i16)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+ break;
+ }
+ case AArch64ISD::ST2LANEpost: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+ break;
+ }
+ case AArch64ISD::ST3LANEpost: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+ break;
+ }
+ case AArch64ISD::ST4LANEpost: {
+ VT = Node->getOperand(1).getValueType();
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+ break;
}
- SDNode *ResNode = SelectCode(Node);
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::FROUND:
+ if (SDNode *I = SelectLIBM(Node))
+ return I;
+ break;
+ }
- DEBUG(dbgs() << "=> ";
- if (ResNode == NULL || ResNode == Node)
- Node->dump(CurDAG);
- else
- ResNode->dump(CurDAG);
- dbgs() << "\n");
+ // Select the default instruction
+ ResNode = SelectCode(Node);
+
+ DEBUG(errs() << "=> ");
+ if (ResNode == nullptr || ResNode == Node)
+ DEBUG(Node->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DEBUG(errs() << "\n");
return ResNode;
}
-/// This pass converts a legalized DAG into a AArch64-specific DAG, ready for
-/// instruction scheduling.
-FunctionPass *llvm::createAArch64ISelDAG(AArch64TargetMachine &TM,
+/// createAArch64ISelDag - This pass converts a legalized DAG into a
+/// AArch64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new AArch64DAGToDAGISel(TM, OptLevel);
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 388973a..80d6669 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,46 +7,87 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the interfaces that AArch64 uses to lower LLVM code into a
-// selection DAG.
+// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "aarch64-isel"
-#include "AArch64.h"
#include "AArch64ISelLowering.h"
+#include "AArch64PerfectShuffle.h"
+#include "AArch64Subtarget.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/Support/MathExtras.h"
-
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
using namespace llvm;
-static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
- assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() &&
- "unknown subtarget type");
- return new AArch64ElfTargetObjectFile();
+#define DEBUG_TYPE "aarch64-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+enum AlignMode {
+ StrictAlign,
+ NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+ cl::Hidden, cl::init(NoStrictAlign),
+ cl::values(
+ clEnumValN(StrictAlign, "aarch64-strict-align",
+ "Disallow all unaligned memory accesses"),
+ clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
+ "Allow unaligned memory accesses"),
+ clEnumValEnd));
+
+// Place holder until extr generation is tested fully.
+static cl::opt<bool>
+EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
+ cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
+ cl::desc("Allow AArch64 SLI/SRI formation"),
+ cl::init(false));
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+ if (TM.getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return new AArch64_MachoTargetObjectFile();
+
+ return new AArch64_ELFTargetObjectFile();
}
AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
- : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
+ : TargetLowering(TM, createTLOF(TM)) {
+ Subtarget = &TM.getSubtarget<AArch64Subtarget>();
- const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
-
- // SIMD compares set the entire lane's bits to 1
+ // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
+ // we have to make something up. Arbitrarily, choose ZeroOrOne.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // When comparing vectors the result sets the different elements in the
+ // vector to all-one or all-zero.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- // Scalar register <-> type mapping
- addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
- addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
+ addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
@@ -56,201 +97,86 @@
}
if (Subtarget->hasNEON()) {
- // And the vectors
- addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass);
- addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
- addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
- addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
- addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
- addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
- addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
- addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
- addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
+ addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+ addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+ // Someone set us up the NEON.
+ addDRTypeForNEON(MVT::v2f32);
+ addDRTypeForNEON(MVT::v8i8);
+ addDRTypeForNEON(MVT::v4i16);
+ addDRTypeForNEON(MVT::v2i32);
+ addDRTypeForNEON(MVT::v1i64);
+ addDRTypeForNEON(MVT::v1f64);
+
+ addQRTypeForNEON(MVT::v4f32);
+ addQRTypeForNEON(MVT::v2f64);
+ addQRTypeForNEON(MVT::v16i8);
+ addQRTypeForNEON(MVT::v8i16);
+ addQRTypeForNEON(MVT::v4i32);
+ addQRTypeForNEON(MVT::v2i64);
}
+ // Compute derived properties from the register classes
computeRegisterProperties();
- // We combine OR nodes for bitfield and NEON BSL operations.
- setTargetDAGCombine(ISD::OR);
-
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::SRL);
- setTargetDAGCombine(ISD::SHL);
-
- setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
- setTargetDAGCombine(ISD::INTRINSIC_VOID);
- setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
-
- // AArch64 does not have i1 loads, or much of anything for i1 really.
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
-
- setStackPointerRegisterToSaveRestore(AArch64::XSP);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
- setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
-
- // We'll lower globals to wrappers for selection.
+ // Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
-
- // A64 instructions have the comparison predicate attached to the user of the
- // result, but having a separate comparison is valuable for matching.
- setOperationAction(ISD::BR_CC, MVT::i32, Custom);
- setOperationAction(ISD::BR_CC, MVT::i64, Custom);
- setOperationAction(ISD::BR_CC, MVT::f32, Custom);
- setOperationAction(ISD::BR_CC, MVT::f64, Custom);
-
- setOperationAction(ISD::SELECT, MVT::i32, Custom);
- setOperationAction(ISD::SELECT, MVT::i64, Custom);
- setOperationAction(ISD::SELECT, MVT::f32, Custom);
- setOperationAction(ISD::SELECT, MVT::f64, Custom);
-
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-
- setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
-
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::JumpTable, MVT::i32, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
- setOperationAction(ISD::VASTART, MVT::Other, Custom);
- setOperationAction(ISD::VACOPY, MVT::Other, Custom);
- setOperationAction(ISD::VAEND, MVT::Other, Expand);
- setOperationAction(ISD::VAARG, MVT::Other, Expand);
-
- setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
- setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
-
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
-
- setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-
- setOperationAction(ISD::SREM, MVT::i32, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
-
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
-
- // Legal floating-point operations.
- setOperationAction(ISD::FABS, MVT::f32, Legal);
- setOperationAction(ISD::FABS, MVT::f64, Legal);
-
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
-
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
-
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-
- setOperationAction(ISD::FNEG, MVT::f32, Legal);
- setOperationAction(ISD::FNEG, MVT::f64, Legal);
-
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
-
- setOperationAction(ISD::FSQRT, MVT::f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::f64, Legal);
-
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
-
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
-
- // Illegal floating-point operations.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-
- setOperationAction(ISD::FCOS, MVT::f32, Expand);
- setOperationAction(ISD::FCOS, MVT::f64, Expand);
-
- setOperationAction(ISD::FEXP, MVT::f32, Expand);
- setOperationAction(ISD::FEXP, MVT::f64, Expand);
-
- setOperationAction(ISD::FEXP2, MVT::f32, Expand);
- setOperationAction(ISD::FEXP2, MVT::f64, Expand);
-
- setOperationAction(ISD::FLOG, MVT::f32, Expand);
- setOperationAction(ISD::FLOG, MVT::f64, Expand);
-
- setOperationAction(ISD::FLOG2, MVT::f32, Expand);
- setOperationAction(ISD::FLOG2, MVT::f64, Expand);
-
- setOperationAction(ISD::FLOG10, MVT::f32, Expand);
- setOperationAction(ISD::FLOG10, MVT::f64, Expand);
-
- setOperationAction(ISD::FPOW, MVT::f32, Expand);
- setOperationAction(ISD::FPOW, MVT::f64, Expand);
-
- setOperationAction(ISD::FPOWI, MVT::f32, Expand);
- setOperationAction(ISD::FPOWI, MVT::f64, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f80, Expand);
- setOperationAction(ISD::FSIN, MVT::f32, Expand);
- setOperationAction(ISD::FSIN, MVT::f64, Expand);
-
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ // Custom lowering hooks are needed for XOR
+ // to fold it into CSINC/CSINV.
+ setOperationAction(ISD::XOR, MVT::i32, Custom);
+ setOperationAction(ISD::XOR, MVT::i64, Custom);
// Virtually no operation on f128 is legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
- setOperationAction(ISD::FABS, MVT::f128, Expand);
- setOperationAction(ISD::FADD, MVT::f128, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
- setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FDIV, MVT::f128, Custom);
- setOperationAction(ISD::FMA, MVT::f128, Expand);
- setOperationAction(ISD::FMUL, MVT::f128, Custom);
- setOperationAction(ISD::FNEG, MVT::f128, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::f128, Expand);
- setOperationAction(ISD::FPOW, MVT::f128, Expand);
- setOperationAction(ISD::FREM, MVT::f128, Expand);
- setOperationAction(ISD::FRINT, MVT::f128, Expand);
- setOperationAction(ISD::FSIN, MVT::f128, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSQRT, MVT::f128, Expand);
- setOperationAction(ISD::FSUB, MVT::f128, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
- setOperationAction(ISD::SETCC, MVT::f128, Custom);
- setOperationAction(ISD::BR_CC, MVT::f128, Custom);
- setOperationAction(ISD::SELECT, MVT::f128, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FABS, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ setOperationAction(ISD::FRINT, MVT::f128, Expand);
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
@@ -266,623 +192,583 @@
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
- // i128 shift operation support
- setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+ // Variable arguments.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
- // This prevents LLVM trying to compress double constants into a floating
- // constant-pool entry and trying to load from there. It's of doubtful benefit
- // for A64: we'd need LDR followed by FCVT, I believe.
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ // Variable-sized objects.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
- setTruncStoreAction(MVT::f128, MVT::f64, Expand);
- setTruncStoreAction(MVT::f128, MVT::f32, Expand);
- setTruncStoreAction(MVT::f128, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
-
+ // Exception handling.
+ // FIXME: These are guesses. Has this been defined yet?
setExceptionPointerRegister(AArch64::X0);
setExceptionSelectorRegister(AArch64::X1);
- if (Subtarget->hasNEON()) {
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand);
+ // Constant pool entries
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ // BlockAddress
+ setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ setOperationAction(ISD::ADDC, MVT::i64, Custom);
+ setOperationAction(ISD::ADDE, MVT::i64, Custom);
+ setOperationAction(ISD::SUBC, MVT::i64, Custom);
+ setOperationAction(ISD::SUBE, MVT::i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
+ // AArch64 lacks both left-rotate and popcount instructions.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+ // AArch64 doesn't have {U|S}MUL_LOHI.
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
- setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
- setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
- setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
- setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
- setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
- setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+ // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
+ // counterparts, which AArch64 supports directly.
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
- setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+ // Custom lower Add/Sub/Mul with overflow.
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i32, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i32, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
- setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
- setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
- setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
- setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
-
- setOperationAction(ISD::SINT_TO_FP, MVT::v1i8, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v1i16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v1i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
-
- setOperationAction(ISD::UINT_TO_FP, MVT::v1i8, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v1i16, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v1i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
-
- setOperationAction(ISD::FP_TO_SINT, MVT::v1i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v1i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v1i32, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Custom);
-
- setOperationAction(ISD::FP_TO_UINT, MVT::v1i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v1i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v1i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Custom);
-
- // Neon does not support vector divide/remainder operations except
- // floating-point divide.
- setOperationAction(ISD::SDIV, MVT::v1i8, Expand);
- setOperationAction(ISD::SDIV, MVT::v8i8, Expand);
- setOperationAction(ISD::SDIV, MVT::v16i8, Expand);
- setOperationAction(ISD::SDIV, MVT::v1i16, Expand);
- setOperationAction(ISD::SDIV, MVT::v4i16, Expand);
- setOperationAction(ISD::SDIV, MVT::v8i16, Expand);
- setOperationAction(ISD::SDIV, MVT::v1i32, Expand);
- setOperationAction(ISD::SDIV, MVT::v2i32, Expand);
- setOperationAction(ISD::SDIV, MVT::v4i32, Expand);
- setOperationAction(ISD::SDIV, MVT::v1i64, Expand);
- setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
-
- setOperationAction(ISD::UDIV, MVT::v1i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v8i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v16i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v1i16, Expand);
- setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
- setOperationAction(ISD::UDIV, MVT::v8i16, Expand);
- setOperationAction(ISD::UDIV, MVT::v1i32, Expand);
- setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
- setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
- setOperationAction(ISD::UDIV, MVT::v1i64, Expand);
- setOperationAction(ISD::UDIV, MVT::v2i64, Expand);
-
- setOperationAction(ISD::SREM, MVT::v1i8, Expand);
- setOperationAction(ISD::SREM, MVT::v8i8, Expand);
- setOperationAction(ISD::SREM, MVT::v16i8, Expand);
- setOperationAction(ISD::SREM, MVT::v1i16, Expand);
- setOperationAction(ISD::SREM, MVT::v4i16, Expand);
- setOperationAction(ISD::SREM, MVT::v8i16, Expand);
- setOperationAction(ISD::SREM, MVT::v1i32, Expand);
- setOperationAction(ISD::SREM, MVT::v2i32, Expand);
- setOperationAction(ISD::SREM, MVT::v4i32, Expand);
- setOperationAction(ISD::SREM, MVT::v1i64, Expand);
- setOperationAction(ISD::SREM, MVT::v2i64, Expand);
-
- setOperationAction(ISD::UREM, MVT::v1i8, Expand);
- setOperationAction(ISD::UREM, MVT::v8i8, Expand);
- setOperationAction(ISD::UREM, MVT::v16i8, Expand);
- setOperationAction(ISD::UREM, MVT::v1i16, Expand);
- setOperationAction(ISD::UREM, MVT::v4i16, Expand);
- setOperationAction(ISD::UREM, MVT::v8i16, Expand);
- setOperationAction(ISD::UREM, MVT::v1i32, Expand);
- setOperationAction(ISD::UREM, MVT::v2i32, Expand);
- setOperationAction(ISD::UREM, MVT::v4i32, Expand);
- setOperationAction(ISD::UREM, MVT::v1i64, Expand);
- setOperationAction(ISD::UREM, MVT::v2i64, Expand);
-
- setOperationAction(ISD::FREM, MVT::v2f32, Expand);
- setOperationAction(ISD::FREM, MVT::v4f32, Expand);
- setOperationAction(ISD::FREM, MVT::v1f64, Expand);
- setOperationAction(ISD::FREM, MVT::v2f64, Expand);
-
- setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
- setOperationAction(ISD::SELECT, MVT::v16i8, Expand);
- setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
- setOperationAction(ISD::SELECT, MVT::v8i16, Expand);
- setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
- setOperationAction(ISD::SELECT, MVT::v2i64, Expand);
- setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
- setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
- setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
- setOperationAction(ISD::SELECT, MVT::v2f64, Expand);
-
- setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom);
-
- // Vector ExtLoad and TruncStore are expanded.
- for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
- I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
- MVT VT = (MVT::SimpleValueType) I;
- setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, Expand);
- for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE;
- II <= MVT::LAST_VECTOR_VALUETYPE; ++II) {
- MVT VT1 = (MVT::SimpleValueType) II;
- // A TruncStore has two vector types of the same number of elements
- // and different element sizes.
- if (VT.getVectorNumElements() == VT1.getVectorNumElements() &&
- VT.getVectorElementType().getSizeInBits()
- > VT1.getVectorElementType().getSizeInBits())
- setTruncStoreAction(VT, VT1, Expand);
- }
- }
-
- // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
- // FIXME: For a v2i64 multiply, we copy VPR to GPR and do 2 i64 multiplies,
- // and then copy back to VPR. This solution may be optimized by Following 3
- // NEON instructions:
- // pmull v2.1q, v0.1d, v1.1d
- // pmull2 v3.1q, v0.2d, v1.2d
- // ins v2.d[1], v3.d[0]
- // As currently we can't verify the correctness of such assumption, we can
- // do such optimization in the future.
- setOperationAction(ISD::MUL, MVT::v1i64, Expand);
- setOperationAction(ISD::MUL, MVT::v2i64, Expand);
-
- setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
- setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
- setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
- setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
- setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
- setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
- setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
- setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
- setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+ // AArch64 has implementations of a lot of rounding-like FP operations.
+ static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
+ for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
+ MVT Ty = RoundingTypes[I];
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
}
- setTargetDAGCombine(ISD::SETCC);
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ if (Subtarget->isTargetMachO()) {
+ // For iOS, we don't want to the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+ // traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ } else {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ }
+
+ // AArch64 does not have floating-point extending loads, i1 sign-extending
+ // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+ // Indexed loads and stores are supported.
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::i8, Legal);
+ setIndexedLoadAction(im, MVT::i16, Legal);
+ setIndexedLoadAction(im, MVT::i32, Legal);
+ setIndexedLoadAction(im, MVT::i64, Legal);
+ setIndexedLoadAction(im, MVT::f64, Legal);
+ setIndexedLoadAction(im, MVT::f32, Legal);
+ setIndexedStoreAction(im, MVT::i8, Legal);
+ setIndexedStoreAction(im, MVT::i16, Legal);
+ setIndexedStoreAction(im, MVT::i32, Legal);
+ setIndexedStoreAction(im, MVT::i64, Legal);
+ setIndexedStoreAction(im, MVT::f64, Legal);
+ setIndexedStoreAction(im, MVT::f32, Legal);
+ }
+
+ // Trap.
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // We combine OR nodes for bitfield operations.
+ setTargetDAGCombine(ISD::OR);
+
+ // Vector add and sub nodes may conceal a high-half opportunity.
+ // Also, try to fold ADD into CSINC/CSINV..
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
+ setTargetDAGCombine(ISD::STORE);
+
+ setTargetDAGCombine(ISD::MUL);
+
+ setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
+
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+
+ setStackPointerRegisterToSaveRestore(AArch64::SP);
+
+ setSchedulingPreference(Sched::Hybrid);
+
+ // Enable TBZ/TBNZ
+ MaskAndBranchFoldingIsLegal = true;
+
+ setMinFunctionAlignment(2);
+
+ RequireStrictAlign = (Align == StrictAlign);
+
+ setHasExtractBitsInsn(true);
+
+ if (Subtarget->hasNEON()) {
+ // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+ // silliness like this:
+ setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+ setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+
+ // AArch64 doesn't have a direct vector ->f32 conversion instructions for
+ // elements smaller than i32, so promote the input to i32 first.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+ // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+
+ // AArch64 doesn't have MUL.2d:
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ // Likewise, narrowing and extending vector loads/stores aren't handled
+ // directly.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+ Expand);
+
+ setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+ setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+
+ for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ setTruncStoreAction((MVT::SimpleValueType)VT,
+ (MVT::SimpleValueType)InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // AArch64 has implementations of a lot of rounding-like FP operations.
+ static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
+ for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
+ MVT Ty = RoundingVecTypes[I];
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
+ }
+ }
+}
+
+void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+ if (VT == MVT::v2f32) {
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+ } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+ }
+
+ // Mark vector float intrinsics as expand.
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
+ setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+ }
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
+
+ setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+
+ // CNT supports only B element sizes.
+ if (VT != MVT::v8i8 && VT != MVT::v16i8)
+ setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+
+ setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+
+ if (Subtarget->isLittleEndian()) {
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
+ setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+ }
+ }
+}
+
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, &AArch64::FPR64RegClass);
+ addTypeForNEON(VT, MVT::v2i32);
+}
+
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, &AArch64::FPR128RegClass);
+ addTypeForNEON(VT, MVT::v4i32);
}
EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
- // It's reasonably important that this value matches the "natural" legal
- // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
- // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
- if (!VT.isVector()) return MVT::i32;
+ if (!VT.isVector())
+ return MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
-static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
- unsigned &LdrOpc,
- unsigned &StrOpc) {
- static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
- AArch64::LDXR_word, AArch64::LDXR_dword};
- static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
- AArch64::LDAXR_word, AArch64::LDAXR_dword};
- static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
- AArch64::STXR_word, AArch64::STXR_dword};
- static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
- AArch64::STLXR_word, AArch64::STLXR_dword};
-
- const unsigned *LoadOps, *StoreOps;
- if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
- LoadOps = LoadAcqs;
- else
- LoadOps = LoadBares;
-
- if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
- StoreOps = StoreRels;
- else
- StoreOps = StoreBares;
-
- assert(isPowerOf2_32(Size) && Size <= 8 &&
- "unsupported size for atomic binary op!");
-
- LdrOpc = LoadOps[Log2_32(Size)];
- StrOpc = StoreOps[Log2_32(Size)];
-}
-
-// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really
-// have value type mapped, and they are both being defined as MVT::untyped.
-// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost
-// would fail to figure out the register pressure correctly.
-std::pair<const TargetRegisterClass*, uint8_t>
-AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
- const TargetRegisterClass *RRC = 0;
- uint8_t Cost = 1;
- switch (VT.SimpleTy) {
+/// computeKnownBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ switch (Op.getOpcode()) {
default:
- return TargetLowering::findRepresentativeClass(VT);
- case MVT::v4i64:
- RRC = &AArch64::QPairRegClass;
- Cost = 2;
break;
- case MVT::v8i64:
- RRC = &AArch64::QQuadRegClass;
- Cost = 4;
+ case AArch64ISD::CSEL: {
+ APInt KnownZero2, KnownOne2;
+ DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+ KnownZero &= KnownZero2;
+ KnownOne &= KnownOne2;
break;
}
- return std::make_pair(RRC, Cost);
-}
-
-MachineBasicBlock *
-AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size,
- unsigned BinOpcode) const {
- // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned incr = MI->getOperand(2).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
- DebugLoc dl = MI->getDebugLoc();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- const TargetRegisterClass *TRC
- = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
-
- // loopMBB:
- // ldxr dest, ptr
- // <binop> scratch, dest, incr
- // stxr stxr_status, scratch, ptr
- // cbnz stxr_status, loopMBB
- // fallthrough --> exitMBB
- BB = loopMBB;
- BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
- if (BinOpcode) {
- // All arithmetic operations we'll be creating are designed to take an extra
- // shift or extend operand, which we can conveniently set to zero.
-
- // Operand order needs to go the other way for NAND.
- if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
- BuildMI(BB, dl, TII->get(BinOpcode), scratch)
- .addReg(incr).addReg(dest).addImm(0);
- else
- BuildMI(BB, dl, TII->get(BinOpcode), scratch)
- .addReg(dest).addReg(incr).addImm(0);
+ case ISD::INTRINSIC_W_CHAIN: {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+ Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+ switch (IntID) {
+ default: return;
+ case Intrinsic::aarch64_ldaxr:
+ case Intrinsic::aarch64_ldxr: {
+ unsigned BitWidth = KnownOne.getBitWidth();
+ EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+ unsigned MemBits = VT.getScalarType().getSizeInBits();
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ return;
+ }
+ }
+ break;
}
-
- // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
- unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
-
- BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
- BuildMI(BB, dl, TII->get(AArch64::CBNZw))
- .addReg(stxr_status).addMBB(loopMBB);
-
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size,
- unsigned CmpOp,
- A64CC::CondCodes Cond) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned incr = MI->getOperand(2).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
-
- unsigned oldval = dest;
- DebugLoc dl = MI->getDebugLoc();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const TargetRegisterClass *TRC, *TRCsp;
- if (Size == 8) {
- TRC = &AArch64::GPR64RegClass;
- TRCsp = &AArch64::GPR64xspRegClass;
- } else {
- TRC = &AArch64::GPR32RegClass;
- TRCsp = &AArch64::GPR32wspRegClass;
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_umaxv:
+ case Intrinsic::aarch64_neon_uminv: {
+ // Figure out the datatype of the vector operand. The UMINV instruction
+ // will zero extend the result, so we can mark as known zero all the
+ // bits larger than the element datatype. 32-bit or larget doesn't need
+ // this as those are legal types and will be handled by isel directly.
+ MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+ unsigned BitWidth = KnownZero.getBitWidth();
+ if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+ assert(BitWidth >= 8 && "Unexpected width!");
+ APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+ KnownZero |= Mask;
+ } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+ assert(BitWidth >= 16 && "Unexpected width!");
+ APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+ KnownZero |= Mask;
+ }
+ break;
+ } break;
+ }
}
-
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- unsigned scratch = MRI.createVirtualRegister(TRC);
- MRI.constrainRegClass(scratch, TRCsp);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
-
- // loopMBB:
- // ldxr dest, ptr
- // cmp incr, dest (, sign extend if necessary)
- // csel scratch, dest, incr, cond
- // stxr stxr_status, scratch, ptr
- // cbnz stxr_status, loopMBB
- // fallthrough --> exitMBB
- BB = loopMBB;
- BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-
- // Build compare and cmov instructions.
- MRI.constrainRegClass(incr, TRCsp);
- BuildMI(BB, dl, TII->get(CmpOp))
- .addReg(incr).addReg(oldval).addImm(0);
-
- BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
- scratch)
- .addReg(oldval).addReg(incr).addImm(Cond);
-
- unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
-
- BuildMI(BB, dl, TII->get(strOpc), stxr_status)
- .addReg(scratch).addReg(ptr);
- BuildMI(BB, dl, TII->get(AArch64::CBNZw))
- .addReg(stxr_status).addMBB(loopMBB);
-
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
+ }
}
-MachineBasicBlock *
-AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned oldval = MI->getOperand(2).getReg();
- unsigned newval = MI->getOperand(3).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+ return MVT::i64;
+}
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const TargetRegisterClass *TRCsp;
- TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
+unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
+ // FIXME: On AArch64, this depends on the type.
+ // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes().
+ // and the offset has to be a multiple of the related size in bytes.
+ return 4095;
+}
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
+FastISel *
+AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return AArch64::createFastISel(funcInfo, libInfo);
+}
- MachineFunction *MF = BB->getParent();
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It; // insert the new blocks after the current block
-
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- // thisMBB:
- // ...
- // fallthrough --> loop1MBB
- BB->addSuccessor(loop1MBB);
-
- // loop1MBB:
- // ldxr dest, [ptr]
- // cmp dest, oldval
- // b.ne exitMBB
- BB = loop1MBB;
- BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-
- unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
- MRI.constrainRegClass(dest, TRCsp);
- BuildMI(BB, dl, TII->get(CmpOp))
- .addReg(dest).addReg(oldval).addImm(0);
- BuildMI(BB, dl, TII->get(AArch64::Bcc))
- .addImm(A64CC::NE).addMBB(exitMBB);
- BB->addSuccessor(loop2MBB);
- BB->addSuccessor(exitMBB);
-
- // loop2MBB:
- // strex stxr_status, newval, [ptr]
- // cbnz stxr_status, loop1MBB
- BB = loop2MBB;
- unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
-
- BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
- BuildMI(BB, dl, TII->get(AArch64::CBNZw))
- .addReg(stxr_status).addMBB(loop1MBB);
- BB->addSuccessor(loop1MBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
+const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default:
+ return nullptr;
+ case AArch64ISD::CALL: return "AArch64ISD::CALL";
+ case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
+ case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
+ case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
+ case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
+ case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
+ case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
+ case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
+ case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
+ case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
+ case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
+ case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
+ case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL";
+ case AArch64ISD::ADC: return "AArch64ISD::ADC";
+ case AArch64ISD::SBC: return "AArch64ISD::SBC";
+ case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
+ case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
+ case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
+ case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
+ case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
+ case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
+ case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
+ case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
+ case AArch64ISD::DUP: return "AArch64ISD::DUP";
+ case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
+ case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
+ case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
+ case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
+ case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
+ case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
+ case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
+ case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
+ case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
+ case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
+ case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
+ case AArch64ISD::BICi: return "AArch64ISD::BICi";
+ case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
+ case AArch64ISD::BSL: return "AArch64ISD::BSL";
+ case AArch64ISD::NEG: return "AArch64ISD::NEG";
+ case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
+ case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
+ case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
+ case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
+ case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
+ case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
+ case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
+ case AArch64ISD::REV16: return "AArch64ISD::REV16";
+ case AArch64ISD::REV32: return "AArch64ISD::REV32";
+ case AArch64ISD::REV64: return "AArch64ISD::REV64";
+ case AArch64ISD::EXT: return "AArch64ISD::EXT";
+ case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
+ case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
+ case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
+ case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
+ case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
+ case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
+ case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
+ case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
+ case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
+ case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
+ case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
+ case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
+ case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
+ case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
+ case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
+ case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
+ case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
+ case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
+ case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
+ case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
+ case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
+ case AArch64ISD::NOT: return "AArch64ISD::NOT";
+ case AArch64ISD::BIT: return "AArch64ISD::BIT";
+ case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
+ case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
+ case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
+ case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
+ case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
+ case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
+ case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
+ case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
+ case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
+ case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
+ case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
+ case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
+ case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
+ case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
+ case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
+ case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
+ case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
+ case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
+ case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
+ case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
+ case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
+ case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
+ case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
+ case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
+ case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
+ case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
+ case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
+ case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
+ case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
+ case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
+ case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
+ case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
+ case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
+ case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
+ case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
+ case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
+ }
}
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MachineBasicBlock *MBB) const {
- // We materialise the F128CSEL pseudo-instruction using conditional branches
- // and loads, giving an instruciton sequence like:
- // str q0, [sp]
- // b.ne IfTrue
- // b Finish
- // IfTrue:
- // str q1, [sp]
- // Finish:
- // ldr q0, [sp]
- //
- // Using virtual registers would probably not be beneficial since COPY
- // instructions are expensive for f128 (there's no actual instruction to
- // implement them).
- //
- // An alternative would be to do an integer-CSEL on some address. E.g.:
- // mov x0, sp
- // add x1, sp, #16
- // str q0, [x0]
- // str q1, [x1]
- // csel x0, x0, x1, ne
- // ldr q0, [x0]
- //
- // It's unclear which approach is actually optimal.
+ // We materialise the F128CSEL pseudo-instruction as some control flow and a
+ // phi node:
+
+ // OrigBB:
+ // [... previous instrs leading to comparison ...]
+ // b.ne TrueBB
+ // b EndBB
+ // TrueBB:
+ // ; Fallthrough
+ // EndBB:
+ // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
+
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
MachineFunction *MF = MBB->getParent();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
@@ -906,49 +792,24 @@
MBB->end());
EndBB->transferSuccessorsAndUpdatePHIs(MBB);
- // We need somewhere to store the f128 value needed.
- int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
-
- // [... start of incoming MBB ...]
- // str qIFFALSE, [sp]
- // b.cc IfTrue
- // b Done
- BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
- .addReg(IfFalseReg)
- .addFrameIndex(ScratchFI)
- .addImm(0);
- BuildMI(MBB, DL, TII->get(AArch64::Bcc))
- .addImm(CondCode)
- .addMBB(TrueBB);
- BuildMI(MBB, DL, TII->get(AArch64::Bimm))
- .addMBB(EndBB);
+ BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
+ BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
MBB->addSuccessor(TrueBB);
MBB->addSuccessor(EndBB);
+ // TrueBB falls through to the end.
+ TrueBB->addSuccessor(EndBB);
+
if (!NZCVKilled) {
- // NZCV is live-through TrueBB.
TrueBB->addLiveIn(AArch64::NZCV);
EndBB->addLiveIn(AArch64::NZCV);
}
- // IfTrue:
- // str qIFTRUE, [sp]
- BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
- .addReg(IfTrueReg)
- .addFrameIndex(ScratchFI)
- .addImm(0);
-
- // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
- // blocks.
- TrueBB->addSuccessor(EndBB);
-
- // Done:
- // ldr qDEST, [sp]
- // [... rest of incoming MBB ...]
- MachineInstr *StartOfEnd = EndBB->begin();
- BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
- .addFrameIndex(ScratchFI)
- .addImm(0);
+ BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
+ .addReg(IfTrueReg)
+ .addMBB(TrueBB)
+ .addReg(IfFalseReg)
+ .addMBB(MBB);
MI->eraseFromParent();
return EndBB;
@@ -956,445 +817,1005 @@
MachineBasicBlock *
AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+ MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
- default: llvm_unreachable("Unhandled instruction with custom inserter");
- case AArch64::F128CSEL:
- return EmitF128CSEL(MI, MBB);
- case AArch64::ATOMIC_LOAD_ADD_I8:
- return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
- case AArch64::ATOMIC_LOAD_ADD_I16:
- return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
- case AArch64::ATOMIC_LOAD_ADD_I32:
- return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
- case AArch64::ATOMIC_LOAD_ADD_I64:
- return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
-
- case AArch64::ATOMIC_LOAD_SUB_I8:
- return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
- case AArch64::ATOMIC_LOAD_SUB_I16:
- return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
- case AArch64::ATOMIC_LOAD_SUB_I32:
- return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
- case AArch64::ATOMIC_LOAD_SUB_I64:
- return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
-
- case AArch64::ATOMIC_LOAD_AND_I8:
- return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
- case AArch64::ATOMIC_LOAD_AND_I16:
- return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
- case AArch64::ATOMIC_LOAD_AND_I32:
- return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
- case AArch64::ATOMIC_LOAD_AND_I64:
- return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
-
- case AArch64::ATOMIC_LOAD_OR_I8:
- return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
- case AArch64::ATOMIC_LOAD_OR_I16:
- return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
- case AArch64::ATOMIC_LOAD_OR_I32:
- return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
- case AArch64::ATOMIC_LOAD_OR_I64:
- return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
-
- case AArch64::ATOMIC_LOAD_XOR_I8:
- return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
- case AArch64::ATOMIC_LOAD_XOR_I16:
- return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
- case AArch64::ATOMIC_LOAD_XOR_I32:
- return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
- case AArch64::ATOMIC_LOAD_XOR_I64:
- return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
-
- case AArch64::ATOMIC_LOAD_NAND_I8:
- return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
- case AArch64::ATOMIC_LOAD_NAND_I16:
- return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
- case AArch64::ATOMIC_LOAD_NAND_I32:
- return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
- case AArch64::ATOMIC_LOAD_NAND_I64:
- return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
-
- case AArch64::ATOMIC_LOAD_MIN_I8:
- return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
- case AArch64::ATOMIC_LOAD_MIN_I16:
- return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
- case AArch64::ATOMIC_LOAD_MIN_I32:
- return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
- case AArch64::ATOMIC_LOAD_MIN_I64:
- return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
-
- case AArch64::ATOMIC_LOAD_MAX_I8:
- return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
- case AArch64::ATOMIC_LOAD_MAX_I16:
- return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
- case AArch64::ATOMIC_LOAD_MAX_I32:
- return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
- case AArch64::ATOMIC_LOAD_MAX_I64:
- return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
-
- case AArch64::ATOMIC_LOAD_UMIN_I8:
- return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
- case AArch64::ATOMIC_LOAD_UMIN_I16:
- return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
- case AArch64::ATOMIC_LOAD_UMIN_I32:
- return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
- case AArch64::ATOMIC_LOAD_UMIN_I64:
- return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
-
- case AArch64::ATOMIC_LOAD_UMAX_I8:
- return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
- case AArch64::ATOMIC_LOAD_UMAX_I16:
- return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
- case AArch64::ATOMIC_LOAD_UMAX_I32:
- return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
- case AArch64::ATOMIC_LOAD_UMAX_I64:
- return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
-
- case AArch64::ATOMIC_SWAP_I8:
- return emitAtomicBinary(MI, MBB, 1, 0);
- case AArch64::ATOMIC_SWAP_I16:
- return emitAtomicBinary(MI, MBB, 2, 0);
- case AArch64::ATOMIC_SWAP_I32:
- return emitAtomicBinary(MI, MBB, 4, 0);
- case AArch64::ATOMIC_SWAP_I64:
- return emitAtomicBinary(MI, MBB, 8, 0);
-
- case AArch64::ATOMIC_CMP_SWAP_I8:
- return emitAtomicCmpSwap(MI, MBB, 1);
- case AArch64::ATOMIC_CMP_SWAP_I16:
- return emitAtomicCmpSwap(MI, MBB, 2);
- case AArch64::ATOMIC_CMP_SWAP_I32:
- return emitAtomicCmpSwap(MI, MBB, 4);
- case AArch64::ATOMIC_CMP_SWAP_I64:
- return emitAtomicCmpSwap(MI, MBB, 8);
- }
-}
-
-
-const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch (Opcode) {
- case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC";
- case AArch64ISD::Call: return "AArch64ISD::Call";
- case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV";
- case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad";
- case AArch64ISD::BFI: return "AArch64ISD::BFI";
- case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
- case AArch64ISD::Ret: return "AArch64ISD::Ret";
- case AArch64ISD::SBFX: return "AArch64ISD::SBFX";
- case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC";
- case AArch64ISD::SETCC: return "AArch64ISD::SETCC";
- case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
- case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
- case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL";
- case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
- case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall";
-
- case AArch64ISD::NEON_MOVIMM:
- return "AArch64ISD::NEON_MOVIMM";
- case AArch64ISD::NEON_MVNIMM:
- return "AArch64ISD::NEON_MVNIMM";
- case AArch64ISD::NEON_FMOVIMM:
- return "AArch64ISD::NEON_FMOVIMM";
- case AArch64ISD::NEON_CMP:
- return "AArch64ISD::NEON_CMP";
- case AArch64ISD::NEON_CMPZ:
- return "AArch64ISD::NEON_CMPZ";
- case AArch64ISD::NEON_TST:
- return "AArch64ISD::NEON_TST";
- case AArch64ISD::NEON_QSHLs:
- return "AArch64ISD::NEON_QSHLs";
- case AArch64ISD::NEON_QSHLu:
- return "AArch64ISD::NEON_QSHLu";
- case AArch64ISD::NEON_VDUP:
- return "AArch64ISD::NEON_VDUP";
- case AArch64ISD::NEON_VDUPLANE:
- return "AArch64ISD::NEON_VDUPLANE";
- case AArch64ISD::NEON_REV16:
- return "AArch64ISD::NEON_REV16";
- case AArch64ISD::NEON_REV32:
- return "AArch64ISD::NEON_REV32";
- case AArch64ISD::NEON_REV64:
- return "AArch64ISD::NEON_REV64";
- case AArch64ISD::NEON_UZP1:
- return "AArch64ISD::NEON_UZP1";
- case AArch64ISD::NEON_UZP2:
- return "AArch64ISD::NEON_UZP2";
- case AArch64ISD::NEON_ZIP1:
- return "AArch64ISD::NEON_ZIP1";
- case AArch64ISD::NEON_ZIP2:
- return "AArch64ISD::NEON_ZIP2";
- case AArch64ISD::NEON_TRN1:
- return "AArch64ISD::NEON_TRN1";
- case AArch64ISD::NEON_TRN2:
- return "AArch64ISD::NEON_TRN2";
- case AArch64ISD::NEON_LD1_UPD:
- return "AArch64ISD::NEON_LD1_UPD";
- case AArch64ISD::NEON_LD2_UPD:
- return "AArch64ISD::NEON_LD2_UPD";
- case AArch64ISD::NEON_LD3_UPD:
- return "AArch64ISD::NEON_LD3_UPD";
- case AArch64ISD::NEON_LD4_UPD:
- return "AArch64ISD::NEON_LD4_UPD";
- case AArch64ISD::NEON_ST1_UPD:
- return "AArch64ISD::NEON_ST1_UPD";
- case AArch64ISD::NEON_ST2_UPD:
- return "AArch64ISD::NEON_ST2_UPD";
- case AArch64ISD::NEON_ST3_UPD:
- return "AArch64ISD::NEON_ST3_UPD";
- case AArch64ISD::NEON_ST4_UPD:
- return "AArch64ISD::NEON_ST4_UPD";
- case AArch64ISD::NEON_LD1x2_UPD:
- return "AArch64ISD::NEON_LD1x2_UPD";
- case AArch64ISD::NEON_LD1x3_UPD:
- return "AArch64ISD::NEON_LD1x3_UPD";
- case AArch64ISD::NEON_LD1x4_UPD:
- return "AArch64ISD::NEON_LD1x4_UPD";
- case AArch64ISD::NEON_ST1x2_UPD:
- return "AArch64ISD::NEON_ST1x2_UPD";
- case AArch64ISD::NEON_ST1x3_UPD:
- return "AArch64ISD::NEON_ST1x3_UPD";
- case AArch64ISD::NEON_ST1x4_UPD:
- return "AArch64ISD::NEON_ST1x4_UPD";
- case AArch64ISD::NEON_LD2DUP:
- return "AArch64ISD::NEON_LD2DUP";
- case AArch64ISD::NEON_LD3DUP:
- return "AArch64ISD::NEON_LD3DUP";
- case AArch64ISD::NEON_LD4DUP:
- return "AArch64ISD::NEON_LD4DUP";
- case AArch64ISD::NEON_LD2DUP_UPD:
- return "AArch64ISD::NEON_LD2DUP_UPD";
- case AArch64ISD::NEON_LD3DUP_UPD:
- return "AArch64ISD::NEON_LD3DUP_UPD";
- case AArch64ISD::NEON_LD4DUP_UPD:
- return "AArch64ISD::NEON_LD4DUP_UPD";
- case AArch64ISD::NEON_LD2LN_UPD:
- return "AArch64ISD::NEON_LD2LN_UPD";
- case AArch64ISD::NEON_LD3LN_UPD:
- return "AArch64ISD::NEON_LD3LN_UPD";
- case AArch64ISD::NEON_LD4LN_UPD:
- return "AArch64ISD::NEON_LD4LN_UPD";
- case AArch64ISD::NEON_ST2LN_UPD:
- return "AArch64ISD::NEON_ST2LN_UPD";
- case AArch64ISD::NEON_ST3LN_UPD:
- return "AArch64ISD::NEON_ST3LN_UPD";
- case AArch64ISD::NEON_ST4LN_UPD:
- return "AArch64ISD::NEON_ST4LN_UPD";
- case AArch64ISD::NEON_VEXTRACT:
- return "AArch64ISD::NEON_VEXTRACT";
default:
- return NULL;
+#ifndef NDEBUG
+ MI->dump();
+#endif
+ assert(0 && "Unexpected instruction for custom inserter!");
+ break;
+
+ case AArch64::F128CSEL:
+ return EmitF128CSEL(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+ }
+ llvm_unreachable("Unexpected instruction for custom inserter!");
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
+/// CC
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case ISD::SETNE:
+ return AArch64CC::NE;
+ case ISD::SETEQ:
+ return AArch64CC::EQ;
+ case ISD::SETGT:
+ return AArch64CC::GT;
+ case ISD::SETGE:
+ return AArch64CC::GE;
+ case ISD::SETLT:
+ return AArch64CC::LT;
+ case ISD::SETLE:
+ return AArch64CC::LE;
+ case ISD::SETUGT:
+ return AArch64CC::HI;
+ case ISD::SETUGE:
+ return AArch64CC::HS;
+ case ISD::SETULT:
+ return AArch64CC::LO;
+ case ISD::SETULE:
+ return AArch64CC::LS;
}
}
-static const uint16_t AArch64FPRArgRegs[] = {
- AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
- AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
-};
-static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);
-
-static const uint16_t AArch64ArgRegs[] = {
- AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
- AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
-};
-static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
-
-static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- // Mark all remaining general purpose registers as allocated. We don't
- // backtrack: if (for example) an i128 gets put on the stack, no subsequent
- // i64 will go in registers (C.11).
- for (unsigned i = 0; i < NumArgRegs; ++i)
- State.AllocateReg(AArch64ArgRegs[i]);
-
- return false;
+/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
+static void changeFPCCToAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown FP condition!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ CondCode = AArch64CC::EQ;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ CondCode = AArch64CC::GT;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ CondCode = AArch64CC::GE;
+ break;
+ case ISD::SETOLT:
+ CondCode = AArch64CC::MI;
+ break;
+ case ISD::SETOLE:
+ CondCode = AArch64CC::LS;
+ break;
+ case ISD::SETONE:
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GT;
+ break;
+ case ISD::SETO:
+ CondCode = AArch64CC::VC;
+ break;
+ case ISD::SETUO:
+ CondCode = AArch64CC::VS;
+ break;
+ case ISD::SETUEQ:
+ CondCode = AArch64CC::EQ;
+ CondCode2 = AArch64CC::VS;
+ break;
+ case ISD::SETUGT:
+ CondCode = AArch64CC::HI;
+ break;
+ case ISD::SETUGE:
+ CondCode = AArch64CC::PL;
+ break;
+ case ISD::SETLT:
+ case ISD::SETULT:
+ CondCode = AArch64CC::LT;
+ break;
+ case ISD::SETLE:
+ case ISD::SETULE:
+ CondCode = AArch64CC::LE;
+ break;
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ CondCode = AArch64CC::NE;
+ break;
+ }
}
+/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
+/// CC usable with the vector instructions. Fewer operations are available
+/// without a real NZCV register, so we have to use less efficient combinations
+/// to get the same effect.
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2,
+ bool &Invert) {
+ Invert = false;
+ switch (CC) {
+ default:
+ // Mostly the scalar mappings work fine.
+ changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+ break;
+ case ISD::SETUO:
+ Invert = true; // Fallthrough
+ case ISD::SETO:
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GE;
+ break;
+ case ISD::SETUEQ:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ // All of the compare-mask comparisons are ordered, but we can switch
+ // between the two by a double inversion. E.g. ULE == !OGT.
+ Invert = true;
+ changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
+ break;
+ }
+}
+
+static bool isLegalArithImmed(uint64_t C) {
+ // Matches AArch64DAGToDAGISel::SelectArithImmed().
+ return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
+}
+
+static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDLoc dl, SelectionDAG &DAG) {
+ EVT VT = LHS.getValueType();
+
+ if (VT.isFloatingPoint())
+ return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+
+ // The CMP instruction is just an alias for SUBS, and representing it as
+ // SUBS means that it's possible to get CSE with subtract operations.
+ // A later phase can perform the optimization of setting the destination
+ // register to WZR/XZR if it ends up being unused.
+ unsigned Opcode = AArch64ISD::SUBS;
+
+ if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
+ cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
+ // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
+ // can be set differently by this operation. It comes down to whether
+ // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+ // everything is fine. If not then the optimization is wrong. Thus general
+ // comparisons are only valid if op2 != 0.
+
+ // So, finally, the only LLVM-native comparisons that don't mention C and V
+ // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+ // the absence of information about op2.
+ Opcode = AArch64ISD::ADDS;
+ RHS = RHS.getOperand(1);
+ } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+ !isUnsignedIntSetCC(CC)) {
+ // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+ // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+ // of the signed comparisons.
+ Opcode = AArch64ISD::ANDS;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ }
+
+ return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+ .getValue(1);
+}
+
+static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+ EVT VT = RHS.getValueType();
+ uint64_t C = RHSC->getZExtValue();
+ if (!isLegalArithImmed(C)) {
+ // Constant does not fit, try adjusting it by one?
+ switch (CC) {
+ default:
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ if ((VT == MVT::i32 && C != 0x80000000 &&
+ isLegalArithImmed((uint32_t)(C - 1))) ||
+ (VT == MVT::i64 && C != 0x80000000ULL &&
+ isLegalArithImmed(C - 1ULL))) {
+ CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+ C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ if ((VT == MVT::i32 && C != 0 &&
+ isLegalArithImmed((uint32_t)(C - 1))) ||
+ (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
+ CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+ C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ case ISD::SETLE:
+ case ISD::SETGT:
+ if ((VT == MVT::i32 && C != 0x7fffffff &&
+ isLegalArithImmed((uint32_t)(C + 1))) ||
+ (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+ isLegalArithImmed(C + 1ULL))) {
+ CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+ C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ if ((VT == MVT::i32 && C != 0xffffffff &&
+ isLegalArithImmed((uint32_t)(C + 1))) ||
+ (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+ isLegalArithImmed(C + 1ULL))) {
+ CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ }
+ }
+ }
+
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+ return Cmp;
+}
+
+static std::pair<SDValue, SDValue>
+getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
+ assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
+ "Unsupported value type");
+ SDValue Value, Overflow;
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ unsigned Opc = 0;
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown overflow instruction!");
+ case ISD::SADDO:
+ Opc = AArch64ISD::ADDS;
+ CC = AArch64CC::VS;
+ break;
+ case ISD::UADDO:
+ Opc = AArch64ISD::ADDS;
+ CC = AArch64CC::HS;
+ break;
+ case ISD::SSUBO:
+ Opc = AArch64ISD::SUBS;
+ CC = AArch64CC::VS;
+ break;
+ case ISD::USUBO:
+ Opc = AArch64ISD::SUBS;
+ CC = AArch64CC::LO;
+ break;
+ // Multiply needs a little bit extra work.
+ case ISD::SMULO:
+ case ISD::UMULO: {
+ CC = AArch64CC::NE;
+ bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
+ if (Op.getValueType() == MVT::i32) {
+ unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ // For a 32 bit multiply with overflow check we want the instruction
+ // selector to generate a widening multiply (SMADDL/UMADDL). For that we
+ // need to generate the following pattern:
+ // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
+ LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
+ RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
+ DAG.getConstant(0, MVT::i64));
+ // On AArch64 the upper 32 bits are always zero extended for a 32 bit
+ // operation. We need to clear out the upper 32 bits, because we used a
+ // widening multiply that wrote all 64 bits. In the end this should be a
+ // noop.
+ Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+ if (IsSigned) {
+ // The signed overflow check requires more than just a simple check for
+ // any bit set in the upper 32 bits of the result. These bits could be
+ // just the sign bits of a negative number. To perform the overflow
+ // check we have to arithmetic shift right the 32nd bit of the result by
+ // 31 bits. Then we compare the result to the upper 32 bits.
+ SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
+ DAG.getConstant(32, MVT::i64));
+ UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
+ SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
+ DAG.getConstant(31, MVT::i64));
+ // It is important that LowerBits is last, otherwise the arithmetic
+ // shift will not be folded into the compare (SUBS).
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
+ Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+ .getValue(1);
+ } else {
+ // The overflow check for unsigned multiply is easy. We only need to
+ // check if any of the upper 32 bits are set. This can be done with a
+ // CMP (shifted register). For that we need to generate the following
+ // pattern:
+ // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
+ SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+ DAG.getConstant(32, MVT::i64));
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow =
+ DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+ UpperBits).getValue(1);
+ }
+ break;
+ }
+ assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
+ // For the 64 bit multiply
+ Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+ if (IsSigned) {
+ SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
+ SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
+ DAG.getConstant(63, MVT::i64));
+ // It is important that LowerBits is last, otherwise the arithmetic
+ // shift will not be folded into the compare (SUBS).
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+ .getValue(1);
+ } else {
+ SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow =
+ DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+ UpperBits).getValue(1);
+ }
+ break;
+ }
+ } // switch (...)
+
+ if (Opc) {
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+
+ // Emit the AArch64 operation with overflow check.
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
+ Overflow = Value.getValue(1);
+ }
+ return std::make_pair(Value, Overflow);
+}
+
+SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const {
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
+ SDLoc(Op)).first;
+}
+
+static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
+ SDValue Sel = Op.getOperand(0);
+ SDValue Other = Op.getOperand(1);
+
+ // If neither operand is a SELECT_CC, give up.
+ if (Sel.getOpcode() != ISD::SELECT_CC)
+ std::swap(Sel, Other);
+ if (Sel.getOpcode() != ISD::SELECT_CC)
+ return Op;
+
+ // The folding we want to perform is:
+ // (xor x, (select_cc a, b, cc, 0, -1) )
+ // -->
+ // (csel x, (xor x, -1), cc ...)
+ //
+ // The latter will get matched to a CSINV instruction.
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+ SDValue LHS = Sel.getOperand(0);
+ SDValue RHS = Sel.getOperand(1);
+ SDValue TVal = Sel.getOperand(2);
+ SDValue FVal = Sel.getOperand(3);
+ SDLoc dl(Sel);
+
+ // FIXME: This could be generalized to non-integer comparisons.
+ if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+ return Op;
+
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+ // The the values aren't constants, this isn't the pattern we're looking for.
+ if (!CFVal || !CTVal)
+ return Op;
+
+ // We can commute the SELECT_CC by inverting the condition. This
+ // might be needed to make this fit into a CSINV pattern.
+ if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ // If the constants line up, perform the transform!
+ if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+ FVal = Other;
+ TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
+ DAG.getConstant(-1ULL, Other.getValueType()));
+
+ return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+ CCVal, Cmp);
+ }
+
+ return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ unsigned Opc;
+ bool ExtraOp = false;
+ switch (Op.getOpcode()) {
+ default:
+ assert(0 && "Invalid code");
+ case ISD::ADDC:
+ Opc = AArch64ISD::ADDS;
+ break;
+ case ISD::SUBC:
+ Opc = AArch64ISD::SUBS;
+ break;
+ case ISD::ADDE:
+ Opc = AArch64ISD::ADCS;
+ ExtraOp = true;
+ break;
+ case ISD::SUBE:
+ Opc = AArch64ISD::SBCS;
+ ExtraOp = true;
+ break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2));
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+ return SDValue();
+
+ AArch64CC::CondCode CC;
+ // The actual operation that sets the overflow or carry flag.
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
+
+ // We use 0 and 1 as false and true values.
+ SDValue TVal = DAG.getConstant(1, MVT::i32);
+ SDValue FVal = DAG.getConstant(0, MVT::i32);
+
+ // We use an inverted condition, because the conditional select is inverted
+ // too. This will allow it to be selected to a single instruction:
+ // CSINC Wd, WZR, WZR, invert(cond).
+ SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
+ Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
+ CCVal, Overflow);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+}
+
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ // The data thing is not used.
+ // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+ bool IsStream = !Locality;
+ // When the locality number is set
+ if (Locality) {
+ // The front-end should have filtered out the out-of-range values
+ assert(Locality <= 3 && "Prefetch locality out-of-range");
+ // The locality degree is the opposite of the cache speed.
+ // Put the number the other way around.
+ // The encoding starts at 0 for level 1
+ Locality = 3 - Locality;
+ }
+
+ // built the mask value encoding the expected behavior.
+ unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (Locality << 1) | // Cache level bits
+ (unsigned)IsStream; // Stream bit
+ return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+
+ RTLIB::Libcall LC;
+ LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType() != MVT::f128) {
+ // It's legal except when f128 is involved
+ return Op;
+ }
+
+ RTLIB::Libcall LC;
+ LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ // FP_ROUND node has a second operand indicating whether it is known to be
+ // precise. That doesn't take part in the LibCall so we can't directly use
+ // LowerF128Call.
+ SDValue SrcVal = Op.getOperand(0);
+ return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
+ /*isSigned*/ false, SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+ // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+ // Any additional optimization in this function should be recorded
+ // in the cost tables.
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT VT = Op.getValueType();
+
+ // FP_TO_XINT conversion from the same type are legal.
+ if (VT.getSizeInBits() == InVT.getSizeInBits())
+ return Op;
+
+ if (InVT == MVT::v2f64 || InVT == MVT::v4f32) {
+ SDLoc dl(Op);
+ SDValue Cv =
+ DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+ Op.getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+ } else if (InVT == MVT::v2f32) {
+ SDLoc dl(Op);
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+ return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
+ }
+
+ // Type changing conversions are illegal.
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType().isVector())
+ return LowerVectorFP_TO_INT(Op, DAG);
+
+ if (Op.getOperand(0).getValueType() != MVT::f128) {
+ // It's legal except when f128 is involved
+ return Op;
+ }
+
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ else
+ LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
+ SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+ // Any additional optimization in this function should be recorded
+ // in the cost tables.
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ EVT InVT = In.getValueType();
+
+ // v2i32 to v2f32 is legal.
+ if (VT == MVT::v2f32 && InVT == MVT::v2i32)
+ return Op;
+
+ // This function only handles v2f64 outputs.
+ if (VT == MVT::v2f64) {
+ // Extend the input argument to a v2i64 that we can feed into the
+ // floating point conversion. Zero or sign extend based on whether
+ // we're doing a signed or unsigned float conversion.
+ unsigned Opc =
+ Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ assert(Op.getNumOperands() == 1 && "FP conversions take one argument");
+ SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0));
+ return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
+ }
+
+ // Scalarize v2i64 to v2f32 conversions.
+ std::vector<SDValue> BuildVectorOps;
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+ SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In,
+ DAG.getConstant(i, MVT::i64));
+ Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr);
+ BuildVectorOps.push_back(Sclr);
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps);
+}
+
+SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType().isVector())
+ return LowerVectorINT_TO_FP(Op, DAG);
+
+ // i128 conversions are libcalls.
+ if (Op.getOperand(0).getValueType() == MVT::i128)
+ return SDValue();
+
+ // Other conversions are legal, unless it's to the completely software-based
+ // fp128.
+ if (Op.getValueType() != MVT::f128)
+ return Op;
+
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ else
+ LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
+ SelectionDAG &DAG) const {
+ // For iOS, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values in two S / D registers.
+ SDLoc dl(Op);
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ const char *LibcallName =
+ (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0);
+
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ return CallResult.first;
+}
+
+SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unimplemented operand");
+ return SDValue();
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
+ case ISD::SELECT:
+ return LowerSELECT(Op, DAG);
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::ConstantPool:
+ return LowerConstantPool(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::VACOPY:
+ return LowerVACOPY(Op, DAG);
+ case ISD::VAARG:
+ return LowerVAARG(Op, DAG);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ return LowerXALUO(Op, DAG);
+ case ISD::FADD:
+ return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+ case ISD::FSUB:
+ return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+ case ISD::FMUL:
+ return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FDIV:
+ return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+ case ISD::FP_ROUND:
+ return LowerFP_ROUND(Op, DAG);
+ case ISD::FP_EXTEND:
+ return LowerFP_EXTEND(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR:
+ return LowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL:
+ return LowerVectorSRA_SRL_SHL(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRL_PARTS:
+ case ISD::SRA_PARTS:
+ return LowerShiftRightParts(Op, DAG);
+ case ISD::CTPOP:
+ return LowerCTPOP(Op, DAG);
+ case ISD::FCOPYSIGN:
+ return LowerFCOPYSIGN(Op, DAG);
+ case ISD::AND:
+ return LowerVectorAND(Op, DAG);
+ case ISD::OR:
+ return LowerVectorOR(Op, DAG);
+ case ISD::XOR:
+ return LowerXOR(Op, DAG);
+ case ISD::PREFETCH:
+ return LowerPREFETCH(Op, DAG);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return LowerINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
+ case ISD::FSINCOS:
+ return LowerFSINCOS(Op, DAG);
+ }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
+ return 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
#include "AArch64GenCallingConv.inc"
-CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
-
- switch(CC) {
- default: llvm_unreachable("Unsupported calling convention");
- case CallingConv::Fast:
+/// Selects the correct CCAssignFn for a the given CallingConvention
+/// value.
+CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention.");
+ case CallingConv::WebKit_JS:
+ return CC_AArch64_WebKit_JS;
case CallingConv::C:
- return CC_A64_APCS;
+ case CallingConv::Fast:
+ if (!Subtarget->isTargetDarwin())
+ return CC_AArch64_AAPCS;
+ return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
}
}
-void
-AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc DL, SDValue &Chain) const {
+SDValue AArch64TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- AArch64MachineFunctionInfo *FuncInfo
- = MF.getInfo<AArch64MachineFunctionInfo>();
- SmallVector<SDValue, 8> MemOps;
-
- unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
- NumArgRegs);
- unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
- NumFPRArgRegs);
-
- unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
- int GPRIdx = 0;
- if (GPRSaveSize != 0) {
- GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
-
- SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
-
- for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
- unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
- SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 8),
- false, false, 0);
- MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(8, getPointerTy()));
- }
- }
-
- if (getSubtarget()->hasFPARMv8()) {
- unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
- int FPRIdx = 0;
- // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
- // can omit a register save area if we know we'll never use registers of
- // that class.
- if (FPRSaveSize != 0) {
- FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
-
- SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
-
- for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
- unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
- &AArch64::FPR128RegClass);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
- SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 16),
- false, false, 0);
- MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(16, getPointerTy()));
- }
- }
- FuncInfo->setVariadicFPRIdx(FPRIdx);
- FuncInfo->setVariadicFPRSize(FPRSaveSize);
- }
-
- unsigned StackOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), 8);
- int StackIdx = MFI->CreateFixedObject(8, StackOffset, true);
-
- FuncInfo->setVariadicStackIdx(StackIdx);
- FuncInfo->setVariadicGPRIdx(GPRIdx);
- FuncInfo->setVariadicGPRSize(GPRSaveSize);
-
- if (!MemOps.empty()) {
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
- MemOps.size());
- }
-}
-
-
-SDValue
-AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- MachineFunction &MF = DAG.getMachineFunction();
- AArch64MachineFunctionInfo *FuncInfo
- = MF.getInfo<AArch64MachineFunctionInfo>();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
-
+ // Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
+ // At this point, Ins[].VT may already be promoted to i32. To correctly
+ // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+ // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+ // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
+ // we use a special version of AnalyzeFormalArguments to pass in ValVT and
+ // LocVT.
+ unsigned NumArgs = Ins.size();
+ Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+ unsigned CurArgIdx = 0;
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ValVT = Ins[i].VT;
+ std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
+ CurArgIdx = Ins[i].OrigArgIndex;
+
+ // Get type of the original argument.
+ EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ ValVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ ValVT = MVT::i16;
+
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
-
- SDValue ArgValue;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- ISD::ArgFlagsTy Flags = Ins[i].Flags;
- if (Flags.isByVal()) {
- // Byval is used for small structs and HFAs in the PCS, but the system
- // should work in a non-compliant manner for larger structs.
+ if (Ins[i].Flags.isByVal()) {
+ // Byval is used for HFAs in the PCS, but the system should work in a
+ // non-compliant manner for larger structs.
EVT PtrTy = getPointerTy();
- int Size = Flags.getByValSize();
+ int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
- uint32_t BEAlign = 0;
- if (Size < 8 && !getSubtarget()->isLittle())
- BEAlign = 8-Size;
- unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
- VA.getLocMemOffset() + BEAlign,
- false);
+ // FIXME: This works on big-endian for composite byvals, which are the common
+ // case. It should also work for fundamental types too.
+ unsigned FrameIdx =
+ MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
InVals.push_back(FrameIdxN);
continue;
- } else if (VA.isRegLoc()) {
- MVT RegVT = VA.getLocVT();
- const TargetRegisterClass *RC = getRegClassFor(RegVT);
+ } if (VA.isRegLoc()) {
+ // Arguments stored in registers.
+ EVT RegVT = VA.getLocVT();
+
+ SDValue ArgValue;
+ const TargetRegisterClass *RC;
+
+ if (RegVT == MVT::i32)
+ RC = &AArch64::GPR32RegClass;
+ else if (RegVT == MVT::i64)
+ RC = &AArch64::GPR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = &AArch64::FPR32RegClass;
+ else if (RegVT == MVT::f64 || RegVT.is64BitVector())
+ RC = &AArch64::FPR64RegClass;
+ else if (RegVT == MVT::f128 || RegVT.is128BitVector())
+ RC = &AArch64::FPR128RegClass;
+ else
+ llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+ // Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
- ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
- } else { // VA.isRegLoc()
- assert(VA.isMemLoc());
-
- int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
- VA.getLocMemOffset(), true);
-
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
-
-
- }
-
- switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
- case CCValAssign::Full: break;
- case CCValAssign::BCvt:
- ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
- break;
- case CCValAssign::SExt:
- case CCValAssign::ZExt:
- case CCValAssign::AExt:
- case CCValAssign::FPExt: {
- unsigned DestSize = VA.getValVT().getSizeInBits();
- unsigned DestSubReg;
-
- switch (DestSize) {
- case 8: DestSubReg = AArch64::sub_8; break;
- case 16: DestSubReg = AArch64::sub_16; break;
- case 32: DestSubReg = AArch64::sub_32; break;
- case 64: DestSubReg = AArch64::sub_64; break;
- default: llvm_unreachable("Unexpected argument promotion");
+ // If this is an 8, 16 or 32-bit value, it is really passed promoted
+ // to 64 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::AExt:
+ case CCValAssign::SExt:
+ case CCValAssign::ZExt:
+ // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
+ // nodes after our lowering.
+ assert(RegVT == Ins[i].VT && "incorrect register location selected");
+ break;
}
- ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
- VA.getValVT(), ArgValue,
- DAG.getTargetConstant(DestSubReg, MVT::i32)),
- 0);
- break;
- }
- }
+ InVals.push_back(ArgValue);
- InVals.push_back(ArgValue);
+ } else { // VA.isRegLoc()
+ assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
+ unsigned ArgOffset = VA.getLocMemOffset();
+ unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+ uint32_t BEAlign = 0;
+ if (ArgSize < 8 && !Subtarget->isLittleEndian())
+ BEAlign = 8 - ArgSize;
+
+ int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue ArgValue;
+
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ switch (VA.getLocInfo()) {
+ default:
+ break;
+ case CCValAssign::SExt:
+ ExtType = ISD::SEXTLOAD;
+ break;
+ case CCValAssign::ZExt:
+ ExtType = ISD::ZEXTLOAD;
+ break;
+ case CCValAssign::AExt:
+ ExtType = ISD::EXTLOAD;
+ break;
+ }
+
+ ArgValue = DAG.getExtLoad(ExtType, DL, VA.getValVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ VA.getLocVT(),
+ false, false, false, 0);
+
+ InVals.push_back(ArgValue);
+ }
}
- if (isVarArg)
- SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
+ // varargs
+ if (isVarArg) {
+ if (!Subtarget->isTargetDarwin()) {
+ // The AAPCS variadic function ABI is identical to the non-variadic
+ // one. As a result there may be more arguments in registers and we should
+ // save them for future reference.
+ saveVarArgRegisters(CCInfo, DAG, DL, Chain);
+ }
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // This will point to the next argument passed via stack.
+ unsigned StackOffset = CCInfo.getNextStackOffset();
+ // We currently pass all varargs at 8-byte alignment.
+ StackOffset = ((StackOffset + 7) & ~7);
+ AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+ }
+
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
unsigned StackArgSize = CCInfo.getNextStackOffset();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
@@ -1416,393 +1837,120 @@
return Chain;
}
-SDValue
-AArch64TargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
- // CCValAssign - represent the assignment of the return value to a location.
- SmallVector<CCValAssign, 16> RVLocs;
-
- // CCState - Info about the registers and stack slots.
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
-
- // Analyze outgoing return values.
- CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
-
- SDValue Flag;
- SmallVector<SDValue, 4> RetOps(1, Chain);
-
- for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
- // PCS: "If the type, T, of the result of a function is such that
- // void func(T arg) would require that arg be passed as a value in a
- // register (or set of registers) according to the rules in 5.4, then the
- // result is returned in the same registers as would be used for such an
- // argument.
- //
- // Otherwise, the caller shall reserve a block of memory of sufficient
- // size and alignment to hold the result. The address of the memory block
- // shall be passed as an additional argument to the function in x8."
- //
- // This is implemented in two places. The register-return values are dealt
- // with here, more complex returns are passed as an sret parameter, which
- // means we don't have to worry about it during actual return.
- CCValAssign &VA = RVLocs[i];
- assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
-
-
- SDValue Arg = OutVals[i];
-
- // There's no convenient note in the ABI about this as there is for normal
- // arguments, but it says return values are passed in the same registers as
- // an argument would be. I believe that includes the comments about
- // unspecified higher bits, putting the burden of widening on the *caller*
- // for return values.
- switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info");
- case CCValAssign::Full: break;
- case CCValAssign::SExt:
- case CCValAssign::ZExt:
- case CCValAssign::AExt:
- // Floating-point values should only be extended when they're going into
- // memory, which can't happen here so an integer extend is acceptable.
- Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
- break;
- case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
- break;
- }
-
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
- }
-
- RetOps[0] = Chain; // Update chain.
-
- // Add the flag if we have it.
- if (Flag.getNode())
- RetOps.push_back(Flag);
-
- return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
- &RetOps[0], RetOps.size());
-}
-
-unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const {
- // This is a new backend. For anything more precise than this a FE should
- // set an explicit alignment.
- return 4;
-}
-
-SDValue
-AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
- SelectionDAG &DAG = CLI.DAG;
- SDLoc &dl = CLI.DL;
- SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
- SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
- SDValue Callee = CLI.Callee;
- bool &IsTailCall = CLI.IsTailCall;
- CallingConv::ID CallConv = CLI.CallConv;
- bool IsVarArg = CLI.IsVarArg;
-
+void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
+ SelectionDAG &DAG, SDLoc DL,
+ SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
- AArch64MachineFunctionInfo *FuncInfo
- = MF.getInfo<AArch64MachineFunctionInfo>();
- bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
- bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
- bool IsSibCall = false;
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- if (IsTailCall) {
- IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
- Outs, OutVals, Ins, DAG);
+ SmallVector<SDValue, 8> MemOps;
- // A sibling call is one where we're under the usual C ABI and not planning
- // to change that but can still do a tail call:
- if (!TailCallOpt && IsTailCall)
- IsSibCall = true;
+ static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7 };
+ static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+ unsigned FirstVariadicGPR =
+ CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
+
+ unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
+ int GPRIdx = 0;
+ if (GPRSaveSize != 0) {
+ GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
+
+ SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+
+ for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
+ unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(i * 8), false, false, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+ DAG.getConstant(8, getPointerTy()));
+ }
}
+ FuncInfo->setVarArgsGPRIndex(GPRIdx);
+ FuncInfo->setVarArgsGPRSize(GPRSaveSize);
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+ if (Subtarget->hasFPARMv8()) {
+ static const MCPhysReg FPRArgRegs[] = {
+ AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
+ AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
+ static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+ unsigned FirstVariadicFPR =
+ CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
- // On AArch64 (and all other architectures I'm aware of) the most this has to
- // do is adjust the stack pointer.
- unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
- if (IsSibCall) {
- // Since we're not changing the ABI to make this a tail call, the memory
- // operands are already available in the caller's incoming argument space.
- NumBytes = 0;
- }
+ unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+ int FPRIdx = 0;
+ if (FPRSaveSize != 0) {
+ FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
- // FPDiff is the byte offset of the call's argument area from the callee's.
- // Stores to callee stack arguments will be placed in FixedStackSlots offset
- // by this amount for a tail call. In a sibling call it must be 0 because the
- // caller will deallocate the entire stack and the callee still expects its
- // arguments to begin at SP+0. Completely unused for non-tail calls.
- int FPDiff = 0;
+ SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
- if (IsTailCall && !IsSibCall) {
- unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+ for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+ unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
- // FPDiff will be negative if this tail call requires more space than we
- // would automatically have in our incoming argument space. Positive if we
- // can actually shrink the stack.
- FPDiff = NumReusableBytes - NumBytes;
-
- // The stack pointer must be 16-byte aligned at all times it's used for a
- // memory operation, which in practice means at *all* times and in
- // particular across call boundaries. Therefore our own arguments started at
- // a 16-byte aligned SP and the delta applied for the tail call should
- // satisfy the same constraint.
- assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
- }
-
- if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
- dl);
-
- SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
- getPointerTy());
-
- SmallVector<SDValue, 8> MemOpChains;
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- SDValue Arg = OutVals[i];
-
- // Callee does the actual widening, so all extensions just use an implicit
- // definition of the rest of the Loc. Aesthetically, this would be nicer as
- // an ANY_EXTEND, but that isn't valid for floating-point types and this
- // alternative works on integer types too.
- switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
- case CCValAssign::Full: break;
- case CCValAssign::SExt:
- case CCValAssign::ZExt:
- case CCValAssign::AExt:
- case CCValAssign::FPExt: {
- unsigned SrcSize = VA.getValVT().getSizeInBits();
- unsigned SrcSubReg;
-
- switch (SrcSize) {
- case 8: SrcSubReg = AArch64::sub_8; break;
- case 16: SrcSubReg = AArch64::sub_16; break;
- case 32: SrcSubReg = AArch64::sub_32; break;
- case 64: SrcSubReg = AArch64::sub_64; break;
- default: llvm_unreachable("Unexpected argument promotion");
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(i * 16), false, false, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+ DAG.getConstant(16, getPointerTy()));
}
-
- Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
- VA.getLocVT(),
- DAG.getUNDEF(VA.getLocVT()),
- Arg,
- DAG.getTargetConstant(SrcSubReg, MVT::i32)),
- 0);
-
- break;
}
- case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
- break;
- }
-
- if (VA.isRegLoc()) {
- // A normal register (sub-) argument. For now we just note it down because
- // we want to copy things into registers as late as possible to avoid
- // register-pressure (and possibly worse).
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
- continue;
- }
-
- assert(VA.isMemLoc() && "unexpected argument location");
-
- SDValue DstAddr;
- MachinePointerInfo DstInfo;
- if (IsTailCall) {
- uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
- VA.getLocVT().getSizeInBits();
- OpSize = (OpSize + 7) / 8;
- int32_t Offset = VA.getLocMemOffset() + FPDiff;
- int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
-
- DstAddr = DAG.getFrameIndex(FI, getPointerTy());
- DstInfo = MachinePointerInfo::getFixedStack(FI);
-
- // Make sure any stack arguments overlapping with where we're storing are
- // loaded before this eventual operation. Otherwise they'll be clobbered.
- Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
- } else {
- uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize()*8 :
- VA.getLocVT().getSizeInBits();
- OpSize = (OpSize + 7) / 8;
- uint32_t BEAlign = 0;
- if (OpSize < 8 && !getSubtarget()->isLittle())
- BEAlign = 8-OpSize;
- SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + BEAlign);
-
- DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
- DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
- }
-
- if (Flags.isByVal()) {
- SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
- SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
- Flags.getByValAlign(),
- /*isVolatile = */ false,
- /*alwaysInline = */ false,
- DstInfo, MachinePointerInfo(0));
- MemOpChains.push_back(Cpy);
- } else {
- // Normal stack argument, put it where it's needed.
- SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
- false, false, 0);
- MemOpChains.push_back(Store);
- }
+ FuncInfo->setVarArgsFPRIndex(FPRIdx);
+ FuncInfo->setVarArgsFPRSize(FPRSaveSize);
}
- // The loads and stores generated above shouldn't clash with each
- // other. Combining them with this TokenFactor notes that fact for the rest of
- // the backend.
- if (!MemOpChains.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains[0], MemOpChains.size());
-
- // Most of the rest of the instructions need to be glued together; we don't
- // want assignments to actual registers used by a call to be rearranged by a
- // well-meaning scheduler.
- SDValue InFlag;
-
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
+ if (!MemOps.empty()) {
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
-
- // The linker is responsible for inserting veneers when necessary to put a
- // function call destination in range, so we don't need to bother with a
- // wrapper here.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const char *Sym = S->getSymbol();
- Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
- }
-
- // We don't usually want to end the call-sequence here because we would tidy
- // the frame up *after* the call, however in the ABI-changing tail-call case
- // we've carefully laid out the parameters so that when sp is reset they'll be
- // in the correct location.
- if (IsTailCall && !IsSibCall) {
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag, dl);
- InFlag = Chain.getValue(1);
- }
-
- // We produce the following DAG scheme for the actual call instruction:
- // (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?
- //
- // Most arguments aren't going to be used and just keep the values live as
- // far as LLVM is concerned. It's expected to be selected as simply "bl
- // callee" (for a direct, non-tail call).
- std::vector<SDValue> Ops;
- Ops.push_back(Chain);
- Ops.push_back(Callee);
-
- if (IsTailCall) {
- // Each tail call may have to adjust the stack by a different amount, so
- // this information must travel along with the operation for eventual
- // consumption by emitEpilogue.
- Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
- }
-
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
- Ops.push_back(DAG.getRegister(RegsToPass[i].first,
- RegsToPass[i].second.getValueType()));
-
-
- // Add a register mask operand representing the call-preserved registers. This
- // is used later in codegen to constrain register-allocation.
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
- assert(Mask && "Missing call preserved mask for calling convention");
- Ops.push_back(DAG.getRegisterMask(Mask));
-
- // If we needed glue, put it in as the last argument.
- if (InFlag.getNode())
- Ops.push_back(InFlag);
-
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-
- if (IsTailCall) {
- return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
- }
-
- Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
- InFlag = Chain.getValue(1);
-
- // Now we can reclaim the stack, just as well do it before working out where
- // our return value is.
- if (!IsSibCall) {
- uint64_t CalleePopBytes
- = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
-
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(CalleePopBytes, true),
- InFlag, dl);
- InFlag = Chain.getValue(1);
- }
-
- return LowerCallResult(Chain, InFlag, CallConv,
- IsVarArg, Ins, dl, DAG, InVals);
}
-SDValue
-AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue AArch64TargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+ ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), RVLocs, *DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+ // Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
- // Return values that are too big to fit into registers should use an sret
- // pointer, so this can be a lot simpler than the main argument code.
- assert(VA.isRegLoc() && "Memory locations not expected for call return");
+ // Pass 'this' value directly from the argument to return value, to avoid
+ // reg unit interference
+ if (i == 0 && isThisReturn) {
+ assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
+ "unexpected return calling convention register assignment");
+ InVals.push_back(ThisVal);
+ continue;
+ }
- SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
- InFlag);
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
- case CCValAssign::Full: break;
- case CCValAssign::BCvt:
- Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
break;
- case CCValAssign::ZExt:
- case CCValAssign::SExt:
- case CCValAssign::AExt:
- // Floating-point arguments only get extended/truncated if they're going
- // in memory, so using the integer operation is acceptable here.
- Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
@@ -1812,17 +1960,12 @@
return Chain;
}
-bool
-AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool IsVarArg,
- bool IsCalleeStructRet,
- bool IsCallerStructRet,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const {
-
+bool AArch64TargetLowering::isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// For CallingConv::C this function knows whether the ABI needs
// changing. That's not true for other conventions so they will have to opt in
// manually.
@@ -1838,7 +1981,8 @@
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF->arg_begin(),
- e = CallerF->arg_end(); i != e; ++i)
+ e = CallerF->arg_end();
+ i != e; ++i)
if (i->hasByValAttr())
return false;
@@ -1854,10 +1998,10 @@
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
- assert((!IsVarArg || CalleeCC == CallingConv::C)
- && "Unexpected variadic calling convention");
+ assert((!isVarArg || CalleeCC == CallingConv::C) &&
+ "Unexpected variadic calling convention");
- if (IsVarArg && !Outs.empty()) {
+ if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.
@@ -1865,10 +2009,10 @@
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
@@ -1880,12 +2024,12 @@
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
getTargetMachine(), RVLocs1, *DAG.getContext());
- CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
+ CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
getTargetMachine(), RVLocs2, *DAG.getContext());
- CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
+ CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
if (RVLocs1.size() != RVLocs2.size())
return false;
@@ -1909,28 +2053,18 @@
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
- const AArch64MachineFunctionInfo *FuncInfo
- = MF.getInfo<AArch64MachineFunctionInfo>();
+ const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// If the stack arguments for this call would fit into our own save area then
// the call can be made tail.
return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
}
-bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
- bool TailCallOpt) const {
- return CallCC == CallingConv::Fast && TailCallOpt;
-}
-
-bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
- return CallCC == CallingConv::Fast;
-}
-
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo *MFI,
@@ -1946,7 +2080,8 @@
// Add a chain value for each stack argument corresponding
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
- UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
+ UE = DAG.getEntryNode().getNode()->use_end();
+ U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
@@ -1959,625 +2094,609 @@
ArgChains.push_back(SDValue(L, 1));
}
- // Build a tokenfactor for all the chains.
- return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
- &ArgChains[0], ArgChains.size());
+ // Build a tokenfactor for all the chains.
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
-static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
- switch (CC) {
- case ISD::SETEQ: return A64CC::EQ;
- case ISD::SETGT: return A64CC::GT;
- case ISD::SETGE: return A64CC::GE;
- case ISD::SETLT: return A64CC::LT;
- case ISD::SETLE: return A64CC::LE;
- case ISD::SETNE: return A64CC::NE;
- case ISD::SETUGT: return A64CC::HI;
- case ISD::SETUGE: return A64CC::HS;
- case ISD::SETULT: return A64CC::LO;
- case ISD::SETULE: return A64CC::LS;
- default: llvm_unreachable("Unexpected condition code");
+bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+ bool TailCallOpt) const {
+ return CallCC == CallingConv::Fast && TailCallOpt;
+}
+
+bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+ return CallCC == CallingConv::Fast;
+}
+
+/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
+/// and add input and output parameter nodes.
+SDValue
+AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool IsThisReturn = false;
+
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ bool IsSibCall = false;
+
+ if (IsTailCall) {
+ // Check if it's really possible to do a tail call.
+ IsTailCall = isEligibleForTailCallOptimization(
+ Callee, CallConv, IsVarArg, IsStructRet,
+ MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+ if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
+ // A sibling call is one where we're under the usual C ABI and not planning
+ // to change that but can still do a tail call:
+ if (!TailCallOpt && IsTailCall)
+ IsSibCall = true;
+
+ if (IsTailCall)
+ ++NumTailCalls;
}
-}
-bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
- // icmp is implemented using adds/subs immediate, which take an unsigned
- // 12-bit immediate, optionally shifted left by 12 bits.
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
- // Symmetric by using adds/subs
- if (Val < 0)
- Val = -Val;
+ if (IsVarArg) {
+ // Handle fixed and variable vector arguments differently.
+ // Variable vector arguments always go into memory.
+ unsigned NumArgs = Outs.size();
- return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
-}
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
+ /*IsVarArg=*/ !Outs[i].IsFixed);
+ bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ } else {
+ // At this point, Outs[].VT may already be promoted to i32. To correctly
+ // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+ // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+ // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
+ // we use a special version of AnalyzeCallOperands to pass in ValVT and
+ // LocVT.
+ unsigned NumArgs = Outs.size();
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ValVT = Outs[i].VT;
+ // Get type of the original argument.
+ EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+ /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ ValVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ ValVT = MVT::i16;
-SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
- ISD::CondCode CC, SDValue &A64cc,
- SelectionDAG &DAG, SDLoc &dl) const {
- if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
- int64_t C = 0;
- EVT VT = RHSC->getValueType(0);
- bool knownInvalid = false;
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ }
- // I'm not convinced the rest of LLVM handles these edge cases properly, but
- // we can at least get it right.
- if (isSignedIntSetCC(CC)) {
- C = RHSC->getSExtValue();
- } else if (RHSC->getZExtValue() > INT64_MAX) {
- // A 64-bit constant not representable by a signed 64-bit integer is far
- // too big to fit into a SUBS immediate anyway.
- knownInvalid = true;
- } else {
- C = RHSC->getZExtValue();
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ if (IsTailCall && !IsSibCall) {
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+ // Since callee will pop argument stack as a tail call, we must keep the
+ // popped size 16-byte aligned.
+ NumBytes = RoundUpToAlignment(NumBytes, 16);
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // can actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+ }
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall)
+ Chain =
+ DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
+
+ SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ if (Outs[realArgIdx].ArgVT == MVT::i1) {
+ // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+ }
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
}
- if (!knownInvalid && !isLegalICmpImmediate(C)) {
- // Constant does not fit, try adjusting it by one?
- switch (CC) {
- default: break;
- case ISD::SETLT:
- case ISD::SETGE:
- if (isLegalICmpImmediate(C-1)) {
- CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
- RHS = DAG.getConstant(C-1, VT);
- }
- break;
- case ISD::SETULT:
- case ISD::SETUGE:
- if (isLegalICmpImmediate(C-1)) {
- CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
- RHS = DAG.getConstant(C-1, VT);
- }
- break;
- case ISD::SETLE:
- case ISD::SETGT:
- if (isLegalICmpImmediate(C+1)) {
- CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
- RHS = DAG.getConstant(C+1, VT);
- }
- break;
- case ISD::SETULE:
- case ISD::SETUGT:
- if (isLegalICmpImmediate(C+1)) {
- CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
- RHS = DAG.getConstant(C+1, VT);
- }
- break;
+ if (VA.isRegLoc()) {
+ if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+ assert(VA.getLocVT() == MVT::i64 &&
+ "unexpected calling convention register assignment");
+ assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
+ "unexpected use of 'returned'");
+ IsThisReturn = true;
+ }
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ // FIXME: This works on big-endian for composite byvals, which are the
+ // common case. It should also work for fundamental types too.
+ uint32_t BEAlign = 0;
+ unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getLocVT().getSizeInBits();
+ OpSize = (OpSize + 7) / 8;
+ if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+ if (OpSize < 8)
+ BEAlign = 8 - OpSize;
+ }
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset + BEAlign;
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+ PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+
+ if (IsTailCall) {
+ Offset = Offset + FPDiff;
+ int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+ DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+ DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+ // Make sure any stack arguments overlapping with where we're storing
+ // are loaded before this eventual operation. Otherwise they'll be
+ // clobbered.
+ Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+ } else {
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVolatile = */ false,
+ /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+ // promoted to a legal register type i32, we should truncate Arg back to
+ // i1/i8/i16.
+ if (Arg.getValueType().isSimple() &&
+ Arg.getValueType().getSimpleVT() == MVT::i32 &&
+ (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
+ VA.getLocVT() == MVT::i16))
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
+
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+ MemOpChains.push_back(Store);
}
}
}
- A64CC::CondCodes CondCode = IntCCToA64CC(CC);
- A64cc = DAG.getConstant(CondCode, MVT::i32);
- return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ Subtarget->isTargetMachO()) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ bool InternalLinkage = GV->hasInternalLinkage();
+ if (InternalLinkage)
+ Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ else {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
+ AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+ }
+ } else if (ExternalSymbolSDNode *S =
+ dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+ Callee =
+ DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+ }
+ } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+ Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+ }
+
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask;
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const AArch64RegisterInfo *ARI =
+ static_cast<const AArch64RegisterInfo *>(TRI);
+ if (IsThisReturn) {
+ // For 'this' returns, use the X0-preserving mask if applicable
+ Mask = ARI->getThisReturnPreservedMask(CallConv);
+ if (!Mask) {
+ IsThisReturn = false;
+ Mask = ARI->getCallPreservedMask(CallConv);
+ }
+ } else
+ Mask = ARI->getCallPreservedMask(CallConv);
+
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tall call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall)
+ return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+ ? RoundUpToAlignment(NumBytes, 16)
+ : 0;
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(CalleePopBytes, true),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
}
-static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
- A64CC::CondCodes &Alternative) {
- A64CC::CondCodes CondCode = A64CC::Invalid;
- Alternative = A64CC::Invalid;
-
- switch (CC) {
- default: llvm_unreachable("Unknown FP condition!");
- case ISD::SETEQ:
- case ISD::SETOEQ: CondCode = A64CC::EQ; break;
- case ISD::SETGT:
- case ISD::SETOGT: CondCode = A64CC::GT; break;
- case ISD::SETGE:
- case ISD::SETOGE: CondCode = A64CC::GE; break;
- case ISD::SETOLT: CondCode = A64CC::MI; break;
- case ISD::SETOLE: CondCode = A64CC::LS; break;
- case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
- case ISD::SETO: CondCode = A64CC::VC; break;
- case ISD::SETUO: CondCode = A64CC::VS; break;
- case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
- case ISD::SETUGT: CondCode = A64CC::HI; break;
- case ISD::SETUGE: CondCode = A64CC::PL; break;
- case ISD::SETLT:
- case ISD::SETULT: CondCode = A64CC::LT; break;
- case ISD::SETLE:
- case ISD::SETULE: CondCode = A64CC::LE; break;
- case ISD::SETNE:
- case ISD::SETUNE: CondCode = A64CC::NE; break;
- }
- return CondCode;
+bool AArch64TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+ ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
-AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT PtrVT = getPointerTy();
- const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+ ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC);
- switch(getTargetMachine().getCodeModel()) {
- case CodeModel::Small:
- // The most efficient code is PC-relative anyway for the small memory model,
- // so we don't need to worry about relocation model.
- return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- DAG.getTargetBlockAddress(BA, PtrVT, 0,
- AArch64II::MO_NO_FLAG),
- DAG.getTargetBlockAddress(BA, PtrVT, 0,
- AArch64II::MO_LO12),
- DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
- case CodeModel::Large:
+ // Copy the result values into the output registers.
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue Arg = OutVals[realRVLocIdx];
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ if (Outs[i].ArgVT == MVT::i1) {
+ // AAPCS requires i1 to be zero-extended to i8 by the producer of the
+ // value. This is strictly redundant on Darwin (which uses "zeroext
+ // i1"), but will be optimised out before ISel.
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ }
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+ assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+ "unexpected offset in global node");
+
+ // This also catched the large code model case for Darwin.
+ if ((OpFlags & AArch64II::MO_GOT) != 0) {
+ SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes instead of using a wrapper node.
+ return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+ }
+
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
- default:
- llvm_unreachable("Only small and large code models supported now");
- }
-}
-
-
-// (BRCOND chain, val, dest)
-SDValue
-AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue TheBit = Op.getOperand(1);
- SDValue DestBB = Op.getOperand(2);
-
- // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
- // that as the consumer we are responsible for ignoring rubbish in higher
- // bits.
- TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
- DAG.getConstant(1, MVT::i32));
-
- SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
- DAG.getConstant(0, TheBit.getValueType()),
- DAG.getCondCode(ISD::SETNE));
-
- return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
- A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
- DestBB);
-}
-
-// (BR_CC chain, condcode, lhs, rhs, dest)
-SDValue
-AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue Chain = Op.getOperand(0);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
- SDValue LHS = Op.getOperand(2);
- SDValue RHS = Op.getOperand(3);
- SDValue DestBB = Op.getOperand(4);
-
- if (LHS.getValueType() == MVT::f128) {
- // f128 comparisons are lowered to runtime calls by a routine which sets
- // LHS, RHS and CC appropriately for the rest of this function to continue.
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
- // If softenSetCCOperands returned a scalar, we need to compare the result
- // against zero to select between true and false values.
- if (RHS.getNode() == 0) {
- RHS = DAG.getConstant(0, LHS.getValueType());
- CC = ISD::SETNE;
- }
- }
-
- if (LHS.getValueType().isInteger()) {
- SDValue A64cc;
-
- // Integers are handled in a separate function because the combinations of
- // immediates and tests can get hairy and we may want to fiddle things.
- SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
-
- return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
- Chain, CmpOp, A64cc, DestBB);
- }
-
- // Note that some LLVM floating-point CondCodes can't be lowered to a single
- // conditional branch, hence FPCCToA64CC can set a second test, where either
- // passing is sufficient.
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
- CondCode = FPCCToA64CC(CC, Alternative);
- SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
- SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
- SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
- Chain, SetCC, A64cc, DestBB);
-
- if (Alternative != A64CC::Invalid) {
- A64cc = DAG.getConstant(Alternative, MVT::i32);
- A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
- A64BR_CC, SetCC, A64cc, DestBB);
-
- }
-
- return A64BR_CC;
-}
-
-SDValue
-AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
- RTLIB::Libcall Call) const {
- ArgListTy Args;
- ArgListEntry Entry;
- for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
- EVT ArgVT = Op.getOperand(i).getValueType();
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
- Args.push_back(Entry);
- }
- SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
-
- Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
-
- // By default, the input chain to this libcall is the entry node of the
- // function. If the libcall is going to be emitted as a tail call then
- // isUsedByReturnOnly will change it to the right chain if the return
- // node which is being folded has a non-entry input chain.
- SDValue InChain = DAG.getEntryNode();
-
- // isTailCall may be true since the callee does not reference caller stack
- // frame. Check if it's in the right position.
- SDValue TCChain = InChain;
- bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
- if (isTailCall)
- InChain = TCChain;
-
- TargetLowering::
- CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
- 0, getLibcallCallingConv(Call), isTailCall,
- /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
- Callee, Args, DAG, SDLoc(Op));
- std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
-
- if (!CallInfo.second.getNode())
- // It's a tailcall, return the chain (which is the DAG root).
- return DAG.getRoot();
-
- return CallInfo.first;
-}
-
-SDValue
-AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType() != MVT::f128) {
- // It's legal except when f128 is involved
- return Op;
- }
-
- RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
-
- SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, SDLoc(Op)).first;
-}
-
-SDValue
-AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-
- RTLIB::Libcall LC;
- LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
-
- return LowerF128ToCall(Op, DAG, LC);
-}
-
-static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) {
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
- SDValue Vec = Op.getOperand(0);
- EVT OpVT = Vec.getValueType();
- unsigned Opc = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
-
- if (VT.getVectorNumElements() == 1) {
- assert(OpVT == MVT::v1f64 && "Unexpected vector type!");
- if (VT.getSizeInBits() == OpVT.getSizeInBits())
- return Op;
- return DAG.UnrollVectorOp(Op.getNode());
- }
-
- if (VT.getSizeInBits() > OpVT.getSizeInBits()) {
- assert(Vec.getValueType() == MVT::v2f32 && VT == MVT::v2i64 &&
- "Unexpected vector type!");
- Vec = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Vec);
- return DAG.getNode(Opc, dl, VT, Vec);
- } else if (VT.getSizeInBits() < OpVT.getSizeInBits()) {
- EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
- OpVT.getVectorElementType().getSizeInBits());
- CastVT =
- EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
- Vec = DAG.getNode(Opc, dl, CastVT, Vec);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Vec);
- }
- return DAG.getNode(Opc, dl, VT, Vec);
-}
-
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- // We custom lower concat_vectors with 4, 8, or 16 operands that are all the
- // same operand and of type v1* using the DUP instruction.
- unsigned NumOps = Op->getNumOperands();
- if (NumOps == 2) {
- assert(Op.getValueType().getSizeInBits() == 128 && "unexpected concat");
- return Op;
- }
-
- if (NumOps != 4 && NumOps != 8 && NumOps != 16)
- return SDValue();
-
- // Must be a single value for VDUP.
- SDValue Op0 = Op.getOperand(0);
- for (unsigned i = 1; i < NumOps; ++i) {
- SDValue OpN = Op.getOperand(i);
- if (Op0 != OpN)
- return SDValue();
- }
-
- // Verify the value type.
- EVT EltVT = Op0.getValueType();
- switch (NumOps) {
- default: llvm_unreachable("Unexpected number of operands");
- case 4:
- if (EltVT != MVT::v1i16 && EltVT != MVT::v1i32)
- return SDValue();
- break;
- case 8:
- if (EltVT != MVT::v1i8 && EltVT != MVT::v1i16)
- return SDValue();
- break;
- case 16:
- if (EltVT != MVT::v1i8)
- return SDValue();
- break;
- }
-
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- // VDUP produces better code for constants.
- if (Op0->getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Op0->getOperand(0));
- return DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, Op0,
- DAG.getConstant(0, MVT::i64));
-}
-
-SDValue
-AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) const {
- if (Op.getValueType().isVector())
- return LowerVectorFP_TO_INT(Op, DAG, IsSigned);
- if (Op.getOperand(0).getValueType() != MVT::f128) {
- // It's legal except when f128 is involved
- return Op;
- }
-
- RTLIB::Libcall LC;
- if (IsSigned)
- LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
- else
- LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
-
- return LowerF128ToCall(Op, DAG, LC);
-}
-
-SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
- MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MFI->setReturnAddressIsTaken(true);
-
- if (verifyReturnAddressArgumentIsConstant(Op, DAG))
- return SDValue();
-
- EVT VT = Op.getValueType();
- SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- if (Depth) {
- SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- SDValue Offset = DAG.getConstant(8, MVT::i64);
- return DAG.getLoad(VT, dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
- }
-
- // Return X30, which contains the return address. Mark it an implicit live-in.
- unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
- return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
-}
-
-
-SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
- const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- MFI->setFrameAddressIsTaken(true);
-
- EVT VT = Op.getValueType();
- SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned FrameReg = AArch64::X29;
- SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
- while (Depth--)
- FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
- return FrameAddr;
-}
-
-SDValue
-AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
- SelectionDAG &DAG) const {
- assert(getTargetMachine().getCodeModel() == CodeModel::Large);
- assert(getTargetMachine().getRelocationModel() == Reloc::Static);
-
- EVT PtrVT = getPointerTy();
- SDLoc dl(Op);
- const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
- const GlobalValue *GV = GN->getGlobal();
-
- SDValue GlobalAddr = DAG.getNode(
- AArch64ISD::WrapperLarge, dl, PtrVT,
- DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
- DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
- DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
- DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
-
- if (GN->getOffset() != 0)
- return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
- DAG.getConstant(GN->getOffset(), PtrVT));
-
- return GlobalAddr;
-}
-
-SDValue
-AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
- SelectionDAG &DAG) const {
- assert(getTargetMachine().getCodeModel() == CodeModel::Small);
-
- EVT PtrVT = getPointerTy();
- SDLoc dl(Op);
- const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
- const GlobalValue *GV = GN->getGlobal();
- unsigned Alignment = GV->getAlignment();
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
- if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
- // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
- // to zero when they remain undefined. In PIC mode the GOT can take care of
- // this, but in absolute mode we use a constant pool load.
- SDValue PoolAddr;
- PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
- DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
- AArch64II::MO_NO_FLAG),
- DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
- AArch64II::MO_LO12),
- DAG.getConstant(8, MVT::i32));
- SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(),
- /*isVolatile=*/ false,
- /*isNonTemporal=*/ true,
- /*isInvariant=*/ true, 8);
- if (GN->getOffset() != 0)
- return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
- DAG.getConstant(GN->getOffset(), PtrVT));
-
- return GlobalAddr;
- }
-
- if (Alignment == 0) {
- const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
- if (GVPtrTy->getElementType()->isSized()) {
- Alignment
- = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
- } else {
- // Be conservative if we can't guess, not that it really matters:
- // functions and labels aren't valid for loads, and the methods used to
- // actually calculate an address work with any alignment.
- Alignment = 1;
- }
- }
-
- unsigned char HiFixup, LoFixup;
- bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
-
- if (UseGOT) {
- HiFixup = AArch64II::MO_GOT;
- LoFixup = AArch64II::MO_GOT_LO12;
- Alignment = 8;
+ AArch64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
} else {
- HiFixup = AArch64II::MO_NO_FLAG;
- LoFixup = AArch64II::MO_LO12;
- }
+ // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
+ // the only correct model on Darwin.
+ SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ OpFlags | AArch64II::MO_PAGE);
+ unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+ SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
- // AArch64's small model demands the following sequence:
- // ADRP x0, somewhere
- // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
- SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
- DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
- HiFixup),
- DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
- LoFixup),
- DAG.getConstant(Alignment, MVT::i32));
-
- if (UseGOT) {
- GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
- GlobalRef);
- }
-
- if (GN->getOffset() != 0)
- return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
- DAG.getConstant(GN->getOffset(), PtrVT));
-
- return GlobalRef;
-}
-
-SDValue
-AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
- SelectionDAG &DAG) const {
- // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
- // we make those distinctions here.
-
- switch (getTargetMachine().getCodeModel()) {
- case CodeModel::Small:
- return LowerGlobalAddressELFSmall(Op, DAG);
- case CodeModel::Large:
- return LowerGlobalAddressELFLarge(Op, DAG);
- default:
- llvm_unreachable("Only small and large code models supported now");
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}
}
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address (for Darwin, currently) and
+/// return an SDValue containing the final node.
+
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+/// + "extern __thread" declaration.
+/// + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i64] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first xword, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "x0".
+///
+/// Since this descriptor may be in a different unit, in general even the
+/// descriptor must be accessed via an indirect load. The "ideal" code sequence
+/// is:
+/// adrp x0, _var@TLVPPAGE
+/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
+/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
+/// ; the function pointer
+/// blr x1 ; Uses descriptor address in x0
+/// ; Address of _var is now in x0.
+///
+/// If the address of _var's descriptor *is* known to the linker, then it can
+/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
+/// a slight efficiency gain.
SDValue
-AArch64TargetLowering::LowerConstantPool(SDValue Op,
- SelectionDAG &DAG) const {
+AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+
SDLoc DL(Op);
- EVT PtrVT = getPointerTy();
- ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Op);
- const Constant *C = CN->getConstVal();
+ MVT PtrVT = getPointerTy();
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- switch(getTargetMachine().getCodeModel()) {
- case CodeModel::Small:
- // The most efficient code is PC-relative anyway for the small memory model,
- // so we don't need to worry about relocation model.
- return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- DAG.getTargetConstantPool(C, PtrVT, 0, 0,
- AArch64II::MO_NO_FLAG),
- DAG.getTargetConstantPool(C, PtrVT, 0, 0,
- AArch64II::MO_LO12),
- DAG.getConstant(CN->getAlignment(), MVT::i32));
- case CodeModel::Large:
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
- DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
- DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
- DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC));
- default:
- llvm_unreachable("Only small and large code models supported now");
- }
+ SDValue TLVPAddr =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+ SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
+
+ // The first entry in the descriptor is a function pointer that we must call
+ // to obtain the address of the variable.
+ SDValue Chain = DAG.getEntryNode();
+ SDValue FuncTLVGet =
+ DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
+ false, true, true, 8);
+ Chain = FuncTLVGet.getValue(1);
+
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setAdjustsStack(true);
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+ // silly).
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const AArch64RegisterInfo *ARI =
+ static_cast<const AArch64RegisterInfo *>(TRI);
+ const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+
+ // Finally, we can make the call. This is just a degenerate version of a
+ // normal AArch64 call node: x0 takes the address of the descriptor, and
+ // returns the address of the variable in this thread.
+ Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
+ Chain =
+ DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
+ DAG.getRegisterMask(Mask), Chain.getValue(1));
+ return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
-SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
- SDValue DescAddr,
- SDLoc DL,
- SelectionDAG &DAG) const {
+/// When accessing thread-local variables under either the general-dynamic or
+/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
+/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
+/// is a function pointer to carry out the resolution. This function takes the
+/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
+/// other registers (except LR, NZCV) are preserved.
+///
+/// Thus, the ideal call sequence on AArch64 is:
+///
+/// adrp x0, :tlsdesc:thread_var
+/// ldr x8, [x0, :tlsdesc_lo12:thread_var]
+/// add x0, x0, :tlsdesc_lo12:thread_var
+/// .tlsdesccall thread_var
+/// blr x8
+/// (TPIDR_EL0 offset now in x0).
+///
+/// The ".tlsdesccall" directive instructs the assembler to insert a particular
+/// relocation to help the linker relax this sequence if it turns out to be too
+/// conservative.
+///
+/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
+/// is harmless.
+SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
+ SDValue DescAddr, SDLoc DL,
+ SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
// The function we need to call is simply the first entry in the GOT for this
// descriptor, load it in preparation.
- SDValue Func, Chain;
- Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
- DescAddr);
+ SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+ // silly).
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const AArch64RegisterInfo *ARI =
+ static_cast<const AArch64RegisterInfo *>(TRI);
+ const uint32_t *Mask = ARI->getTLSCallPreservedMask();
// The function takes only one argument: the address of the descriptor itself
// in X0.
- SDValue Glue;
+ SDValue Glue, Chain;
Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
Glue = Chain.getValue(1);
- // Finally, there's a special calling-convention which means that the lookup
- // must preserve all registers (except X0, obviously).
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
- const AArch64RegisterInfo *A64RI
- = static_cast<const AArch64RegisterInfo *>(TRI);
- const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
-
// We're now ready to populate the argument list, as with a normal call:
- std::vector<SDValue> Ops;
+ SmallVector<SDValue, 6> Ops;
Ops.push_back(Chain);
Ops.push_back(Func);
Ops.push_back(SymAddr);
@@ -2586,22 +2705,18 @@
Ops.push_back(Glue);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
- Ops.size());
+ Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
- // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
- // back to the generic handling code.
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
SDValue
-AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
- SelectionDAG &DAG) const {
- assert(getSubtarget()->isTargetELF() &&
- "TLS not implemented for non-ELF targets");
- assert(getTargetMachine().getCodeModel() == CodeModel::Small
- && "TLS only supported in small memory model");
+AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetELF() && "This function expects an ELF target");
+ assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ "ELF TLS only supported in small memory model");
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
@@ -2613,39 +2728,22 @@
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
- if (Model == TLSModel::InitialExec) {
- TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
- AArch64II::MO_GOTTPREL),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
- AArch64II::MO_GOTTPREL_LO12),
- DAG.getConstant(8, MVT::i32));
- TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
- TPOff);
- } else if (Model == TLSModel::LocalExec) {
- SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
- AArch64II::MO_TPREL_G1);
- SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
- AArch64II::MO_TPREL_G0_NC);
+ if (Model == TLSModel::LocalExec) {
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
- DAG.getTargetConstant(1, MVT::i32)), 0);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
- TPOff, LoVar,
- DAG.getTargetConstant(0, MVT::i32)), 0);
- } else if (Model == TLSModel::GeneralDynamic) {
- // Accesses used in this sequence go via the TLS descriptor which lives in
- // the GOT. Prepare an address we can use to handle this.
- SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
- AArch64II::MO_TLSDESC);
- SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
- AArch64II::MO_TLSDESC_LO12);
- SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- HiDesc, LoDesc,
- DAG.getConstant(8, MVT::i32));
- SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
-
- TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+ DAG.getTargetConstant(16, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ } else if (Model == TLSModel::InitialExec) {
+ TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+ TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
} else if (Model == TLSModel::LocalDynamic) {
// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
@@ -2653,367 +2751,354 @@
// calculation.
// These accesses will need deduplicating if there's more than one.
- AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction()
- .getInfo<AArch64MachineFunctionInfo>();
+ AArch64FunctionInfo *MFI =
+ DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
+ // Accesses used in this sequence go via the TLS descriptor which lives in
+ // the GOT. Prepare an address we can use to handle this.
+ SDValue HiDesc = DAG.getTargetExternalSymbol(
+ "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
+ SDValue LoDesc = DAG.getTargetExternalSymbol(
+ "_TLS_MODULE_BASE_", PtrVT,
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- // Get the location of _TLS_MODULE_BASE_:
- SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
- AArch64II::MO_TLSDESC);
- SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
- AArch64II::MO_TLSDESC_LO12);
- SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- HiDesc, LoDesc,
- DAG.getConstant(8, MVT::i32));
- SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
+ // First argument to the descriptor call is the address of the descriptor
+ // itself.
+ SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
+ DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
- ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ // The call needs a relocation too for linker relaxation. It doesn't make
+ // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+ // the address.
+ SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+ AArch64II::MO_TLS);
- // Get the variable's offset from _TLS_MODULE_BASE_
- SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
- AArch64II::MO_DTPREL_G1);
- SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
- AArch64II::MO_DTPREL_G0_NC);
+ // Now we can calculate the offset from TPIDR_EL0 to this module's
+ // thread-local area.
+ TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
- DAG.getTargetConstant(0, MVT::i32)), 0);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
- TPOff, LoVar,
- DAG.getTargetConstant(0, MVT::i32)), 0);
+ // Now use :dtprel_whatever: operations to calculate this variable's offset
+ // in its thread-storage area.
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i64, 0,
+ AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+
+ SDValue DTPOff =
+ SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+ DAG.getTargetConstant(16, MVT::i32)),
+ 0);
+ DTPOff =
+ SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+
+ TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
+ } else if (Model == TLSModel::GeneralDynamic) {
+ // Accesses used in this sequence go via the TLS descriptor which lives in
+ // the GOT. Prepare an address we can use to handle this.
+ SDValue HiDesc = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
+ SDValue LoDesc = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ // First argument to the descriptor call is the address of the descriptor
+ // itself.
+ SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
+ DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+ // The call needs a relocation too for linker relaxation. It doesn't make
+ // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+ // the address.
+ SDValue SymAddr =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+
+ // Finally we can make a call to calculate the offset from tpidr_el0.
+ TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
} else
- llvm_unreachable("Unsupported TLS access model");
-
+ llvm_unreachable("Unsupported ELF TLS access model");
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
-static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) {
+SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Subtarget->isTargetDarwin())
+ return LowerDarwinGlobalTLSAddress(Op, DAG);
+ else if (Subtarget->isTargetELF())
+ return LowerELFGlobalTLSAddress(Op, DAG);
+
+ llvm_unreachable("Unexpected platform trying to use TLS");
+}
+SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
- EVT VT = Op.getValueType();
- SDValue Vec = Op.getOperand(0);
- unsigned Opc = IsSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
- if (VT.getVectorNumElements() == 1) {
- assert(VT == MVT::v1f64 && "Unexpected vector type!");
- if (VT.getSizeInBits() == Vec.getValueSizeInBits())
- return Op;
- return DAG.UnrollVectorOp(Op.getNode());
+ // Handle f128 first, since lowering it will result in comparing the return
+ // value of a libcall against zero, which is just what the rest of LowerBR_CC
+ // is expecting to deal with.
+ if (LHS.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!RHS.getNode()) {
+ RHS = DAG.getConstant(0, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
}
- if (VT.getSizeInBits() < Vec.getValueSizeInBits()) {
- assert(Vec.getValueType() == MVT::v2i64 && VT == MVT::v2f32 &&
- "Unexpected vector type!");
- Vec = DAG.getNode(Opc, dl, MVT::v2f64, Vec);
- return DAG.getNode(ISD::FP_ROUND, dl, VT, Vec, DAG.getIntPtrConstant(0));
- } else if (VT.getSizeInBits() > Vec.getValueSizeInBits()) {
- unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
- VT.getVectorElementType().getSizeInBits());
- CastVT =
- EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
- Vec = DAG.getNode(CastOpc, dl, CastVT, Vec);
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
+ unsigned Opc = LHS.getOpcode();
+ if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->isOne() &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ "Unexpected condition code.");
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+ return SDValue();
+
+ // The actual operation with overflow check.
+ AArch64CC::CondCode OFCC;
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
+
+ if (CC == ISD::SETNE)
+ OFCC = getInvertedCondCode(OFCC);
+ SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+ return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
+ CCVal, Overflow);
}
- return DAG.getNode(Opc, dl, VT, Vec);
-}
-
-SDValue
-AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) const {
- if (Op.getValueType().isVector())
- return LowerVectorINT_TO_FP(Op, DAG, IsSigned);
- if (Op.getValueType() != MVT::f128) {
- // Legal for everything except f128.
- return Op;
- }
-
- RTLIB::Libcall LC;
- if (IsSigned)
- LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
- else
- LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
-
- return LowerF128ToCall(Op, DAG, LC);
-}
-
-
-SDValue
-AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
- JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- SDLoc dl(JT);
- EVT PtrVT = getPointerTy();
-
- // When compiling PIC, jump tables get put in the code section so a static
- // relocation-style is acceptable for both cases.
- switch (getTargetMachine().getCodeModel()) {
- case CodeModel::Small:
- return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
- AArch64II::MO_LO12),
- DAG.getConstant(1, MVT::i32));
- case CodeModel::Large:
- return DAG.getNode(
- AArch64ISD::WrapperLarge, dl, PtrVT,
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
- default:
- llvm_unreachable("Only small and large code models supported now");
- }
-}
-
-// (SELECT testbit, iftrue, iffalse)
-SDValue
-AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue TheBit = Op.getOperand(0);
- SDValue IfTrue = Op.getOperand(1);
- SDValue IfFalse = Op.getOperand(2);
-
- // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
- // that as the consumer we are responsible for ignoring rubbish in higher
- // bits.
- TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
- DAG.getConstant(1, MVT::i32));
- SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
- DAG.getConstant(0, TheBit.getValueType()),
- DAG.getCondCode(ISD::SETNE));
-
- return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- A64CMP, IfTrue, IfFalse,
- DAG.getConstant(A64CC::NE, MVT::i32));
-}
-
-static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
- SDLoc DL(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- EVT VT = Op.getValueType();
- bool Invert = false;
- SDValue Op0, Op1;
- unsigned Opcode;
-
if (LHS.getValueType().isInteger()) {
+ assert((LHS.getValueType() == RHS.getValueType()) &&
+ (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
- // Attempt to use Vector Integer Compare Mask Test instruction.
- // TST = icmp ne (and (op0, op1), zero).
- if (CC == ISD::SETNE) {
- if (((LHS.getOpcode() == ISD::AND) &&
- ISD::isBuildVectorAllZeros(RHS.getNode())) ||
- ((RHS.getOpcode() == ISD::AND) &&
- ISD::isBuildVectorAllZeros(LHS.getNode()))) {
+ // If the RHS of the comparison is zero, we can potentially fold this
+ // to a specialized branch.
+ const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ if (RHSC && RHSC->getZExtValue() == 0) {
+ if (CC == ISD::SETEQ) {
+ // See if we can use a TBZ to fold in an AND as well.
+ // TBZ has a smaller branch displacement than CBZ. If the offset is
+ // out of bounds, a late MI-layer pass rewrites branches.
+ // 403.gcc is an example that hits this case.
+ if (LHS.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+ SDValue Test = LHS.getOperand(0);
+ uint64_t Mask = LHS.getConstantOperandVal(1);
- SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
- SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
- SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
- return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
+ // TBZ only operates on i64's, but the ext should be free.
+ if (Test.getValueType() == MVT::i32)
+ Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+ return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
+ DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+ }
+
+ return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
+ } else if (CC == ISD::SETNE) {
+ // See if we can use a TBZ to fold in an AND as well.
+ // TBZ has a smaller branch displacement than CBZ. If the offset is
+ // out of bounds, a late MI-layer pass rewrites branches.
+ // 403.gcc is an example that hits this case.
+ if (LHS.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+ SDValue Test = LHS.getOperand(0);
+ uint64_t Mask = LHS.getConstantOperandVal(1);
+
+ // TBNZ only operates on i64's, but the ext should be free.
+ if (Test.getValueType() == MVT::i32)
+ Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+ return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
+ DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+ }
+
+ return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
}
}
- // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
- // Note: Compare against Zero does not support unsigned predicates.
- if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
- ISD::isBuildVectorAllZeros(LHS.getNode())) &&
- !isUnsignedIntSetCC(CC)) {
-
- // If LHS is the zero value, swap operands and CondCode.
- if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
- CC = getSetCCSwappedOperands(CC);
- Op0 = RHS;
- } else
- Op0 = LHS;
-
- // Ensure valid CondCode for Compare Mask against Zero instruction:
- // EQ, GE, GT, LE, LT.
- if (ISD::SETNE == CC) {
- Invert = true;
- CC = ISD::SETEQ;
- }
-
- // Using constant type to differentiate integer and FP compares with zero.
- Op1 = DAG.getConstant(0, MVT::i32);
- Opcode = AArch64ISD::NEON_CMPZ;
-
- } else {
- // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
- // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
- bool Swap = false;
- switch (CC) {
- default:
- llvm_unreachable("Illegal integer comparison.");
- case ISD::SETEQ:
- case ISD::SETGT:
- case ISD::SETGE:
- case ISD::SETUGT:
- case ISD::SETUGE:
- break;
- case ISD::SETNE:
- Invert = true;
- CC = ISD::SETEQ;
- break;
- case ISD::SETULT:
- case ISD::SETULE:
- case ISD::SETLT:
- case ISD::SETLE:
- Swap = true;
- CC = getSetCCSwappedOperands(CC);
- }
-
- if (Swap)
- std::swap(LHS, RHS);
-
- Opcode = AArch64ISD::NEON_CMP;
- Op0 = LHS;
- Op1 = RHS;
- }
-
- // Generate Compare Mask instr or Compare Mask against Zero instr.
- SDValue NeonCmp =
- DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
-
- if (Invert)
- NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
-
- return NeonCmp;
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+ return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
- // Now handle Floating Point cases.
- // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
- if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
- ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
- // If LHS is the zero value, swap operands and CondCode.
- if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
- CC = getSetCCSwappedOperands(CC);
- Op0 = RHS;
- } else
- Op0 = LHS;
-
- // Using constant type to differentiate integer and FP compares with zero.
- Op1 = DAG.getConstantFP(0, MVT::f32);
- Opcode = AArch64ISD::NEON_CMPZ;
- } else {
- // Attempt to use Vector Floating Point Compare Mask instruction.
- Op0 = LHS;
- Op1 = RHS;
- Opcode = AArch64ISD::NEON_CMP;
+ // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+ // clean. Some of them require two branches to implement.
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC::CondCode CC1, CC2;
+ changeFPCCToAArch64CC(CC, CC1, CC2);
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+ SDValue BR1 =
+ DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+ if (CC2 != AArch64CC::AL) {
+ SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+ return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+ Cmp);
}
- SDValue NeonCmpAlt;
- // Some register compares have to be implemented with swapped CC and operands,
- // e.g.: OLT implemented as OGT with swapped operands.
- bool SwapIfRegArgs = false;
-
- // Ensure valid CondCode for FP Compare Mask against Zero instruction:
- // EQ, GE, GT, LE, LT.
- // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
- switch (CC) {
- default:
- llvm_unreachable("Illegal FP comparison");
- case ISD::SETUNE:
- case ISD::SETNE:
- Invert = true; // Fallthrough
- case ISD::SETOEQ:
- case ISD::SETEQ:
- CC = ISD::SETEQ;
- break;
- case ISD::SETOLT:
- case ISD::SETLT:
- CC = ISD::SETLT;
- SwapIfRegArgs = true;
- break;
- case ISD::SETOGT:
- case ISD::SETGT:
- CC = ISD::SETGT;
- break;
- case ISD::SETOLE:
- case ISD::SETLE:
- CC = ISD::SETLE;
- SwapIfRegArgs = true;
- break;
- case ISD::SETOGE:
- case ISD::SETGE:
- CC = ISD::SETGE;
- break;
- case ISD::SETUGE:
- Invert = true;
- CC = ISD::SETLT;
- SwapIfRegArgs = true;
- break;
- case ISD::SETULE:
- Invert = true;
- CC = ISD::SETGT;
- break;
- case ISD::SETUGT:
- Invert = true;
- CC = ISD::SETLE;
- SwapIfRegArgs = true;
- break;
- case ISD::SETULT:
- Invert = true;
- CC = ISD::SETGE;
- break;
- case ISD::SETUEQ:
- Invert = true; // Fallthrough
- case ISD::SETONE:
- // Expand this to (OGT |OLT).
- NeonCmpAlt =
- DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
- CC = ISD::SETLT;
- SwapIfRegArgs = true;
- break;
- case ISD::SETUO:
- Invert = true; // Fallthrough
- case ISD::SETO:
- // Expand this to (OGE | OLT).
- NeonCmpAlt =
- DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
- CC = ISD::SETLT;
- SwapIfRegArgs = true;
- break;
- }
-
- if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
- CC = getSetCCSwappedOperands(CC);
- std::swap(Op0, Op1);
- }
-
- // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
- SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
-
- if (NeonCmpAlt.getNode())
- NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
-
- if (Invert)
- NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
-
- return NeonCmp;
+ return BR1;
}
-// (SETCC lhs, rhs, condcode)
-SDValue
-AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ SDValue In1 = Op.getOperand(0);
+ SDValue In2 = Op.getOperand(1);
+ EVT SrcVT = In2.getValueType();
+ if (SrcVT != VT) {
+ if (SrcVT == MVT::f32 && VT == MVT::f64)
+ In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+ else if (SrcVT == MVT::f64 && VT == MVT::f32)
+ In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
+ else
+ // FIXME: Src type is different, bail out for now. Can VT really be a
+ // vector type?
+ return SDValue();
+ }
+
+ EVT VecVT;
+ EVT EltVT;
+ SDValue EltMask, VecVal1, VecVal2;
+ if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+ EltVT = MVT::i32;
+ VecVT = MVT::v4i32;
+ EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+
+ if (!VT.isVector()) {
+ VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In1);
+ VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In2);
+ } else {
+ VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+ VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+ }
+ } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+ EltVT = MVT::i64;
+ VecVT = MVT::v2i64;
+
+ // We want to materialize a mask with the the high bit set, but the AdvSIMD
+ // immediate moves cannot materialize that in a single instruction for
+ // 64-bit elements. Instead, materialize zero and then negate it.
+ EltMask = DAG.getConstant(0, EltVT);
+
+ if (!VT.isVector()) {
+ VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In1);
+ VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In2);
+ } else {
+ VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+ VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+ }
+ } else {
+ llvm_unreachable("Invalid type for copysign!");
+ }
+
+ std::vector<SDValue> BuildVectorOps;
+ for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
+ BuildVectorOps.push_back(EltMask);
+
+ SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
+
+ // If we couldn't materialize the mask above, then the mask vector will be
+ // the zero vector, and we need to negate it here.
+ if (VT == MVT::f64 || VT == MVT::v2f64) {
+ BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
+ BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
+ BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
+ }
+
+ SDValue Sel =
+ DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
+
+ if (VT == MVT::f32)
+ return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
+ else if (VT == MVT::f64)
+ return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
+ else
+ return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+}
+
+SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+ if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+ return SDValue();
+
+ // While there is no integer popcount instruction, it can
+ // be more efficiently lowered to the following sequence that uses
+ // AdvSIMD registers/instructions as long as the copies to/from
+ // the AdvSIMD registers are cheap.
+ // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
+ // CNT V0.8B, V0.8B // 8xbyte pop-counts
+ // ADDV B0, V0.8B // sum 8xbyte pop-counts
+ // UMOV X0, V0.B[0] // copy byte result back to integer reg
+ SDValue Val = Op.getOperand(0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
+
+ SDValue VecVal;
+ if (VT == MVT::i32) {
+ VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
+ VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
+ VecVal);
+ } else {
+ VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
+ }
+
+ SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+ SDValue UaddLV = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
+
+ if (VT == MVT::i64)
+ UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+ return UaddLV;
+}
+
+SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ if (Op.getValueType().isVector())
+ return LowerVSETCC(Op, DAG);
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc dl(Op);
+
+ // We chose ZeroOrOneBooleanContents, so use zero and one.
EVT VT = Op.getValueType();
+ SDValue TVal = DAG.getConstant(1, VT);
+ SDValue FVal = DAG.getConstant(0, VT);
- if (VT.isVector())
- return LowerVectorSETCC(Op, DAG);
-
+ // Handle f128 first, since one possible outcome is a normal integer
+ // comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
- // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
- // for the rest of the function (some i32 or i64 values).
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, use it.
- if (RHS.getNode() == 0) {
+ if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
return LHS;
@@ -3021,205 +3106,403 @@
}
if (LHS.getValueType().isInteger()) {
- SDValue A64cc;
+ SDValue CCVal;
+ SDValue Cmp =
+ getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
- // Integers are handled in a separate function because the combinations of
- // immediates and tests can get hairy and we may want to fiddle things.
- SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
-
- return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
- CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
- A64cc);
+ // Note that we inverted the condition above, so we reverse the order of
+ // the true and false operands here. This will allow the setcc to be
+ // matched to a single CSINC instruction.
+ return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
}
- // Note that some LLVM floating-point CondCodes can't be lowered to a single
- // conditional branch, hence FPCCToA64CC can set a second test, where either
- // passing is sufficient.
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
- CondCode = FPCCToA64CC(CC, Alternative);
- SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
- SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
- SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
- CmpOp, DAG.getConstant(1, VT),
- DAG.getConstant(0, VT), A64cc);
+ // Now we know we're dealing with FP values.
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
- if (Alternative != A64CC::Invalid) {
- A64cc = DAG.getConstant(Alternative, MVT::i32);
- A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
+ // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
+ // and do the comparison.
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+ AArch64CC::CondCode CC1, CC2;
+ changeFPCCToAArch64CC(CC, CC1, CC2);
+ if (CC2 == AArch64CC::AL) {
+ changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+
+ // Note that we inverted the condition above, so we reverse the order of
+ // the true and false operands here. This will allow the setcc to be
+ // matched to a single CSINC instruction.
+ return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+ } else {
+ // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
+ // totally clean. Some of them require two CSELs to implement. As is in
+ // this case, we emit the first CSEL and then emit a second using the output
+ // of the first as the RHS. We're effectively OR'ing the two CC's together.
+
+ // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+ SDValue CS1 =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+ SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+ return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
-
- return A64SELECT_CC;
}
-static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) {
- SDLoc dl(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue IfTrue = Op.getOperand(2);
- SDValue IfFalse = Op.getOperand(3);
- EVT IfTrueVT = IfTrue.getValueType();
- EVT CondVT = IfTrueVT.changeVectorElementTypeToInteger();
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+/// A SELECT_CC operation is really some kind of max or min if both values being
+/// compared are, in some sense, equal to the results in either case. However,
+/// it is permissible to compare f32 values and produce directly extended f64
+/// values.
+///
+/// Extending the comparison operands would also be allowed, but is less likely
+/// to happen in practice since their use is right here. Note that truncate
+/// operations would *not* be semantically equivalent.
+static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
+ if (Cmp == Result)
+ return true;
- // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will
- // use NEON compare.
- if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) {
- EVT EltVT = LHS.getValueType();
- unsigned EltNum = 128 / EltVT.getSizeInBits();
- EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum);
- unsigned SubConstant =
- (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 :AArch64::sub_64;
- EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64;
- EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum);
-
- LHS
- = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
- VT, DAG.getTargetConstant(0, MVT::i32), LHS,
- DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
- RHS
- = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
- VT, DAG.getTargetConstant(0, MVT::i32), RHS,
- DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
-
- SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC);
- SDValue ResCC = LowerVectorSETCC(VSetCC, DAG);
- if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) {
- EVT DUPVT =
- EVT::getVectorVT(*DAG.getContext(), CEltT,
- IfTrueVT.getSizeInBits() / CEltT.getSizeInBits());
- ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC,
- DAG.getConstant(0, MVT::i64, false));
-
- ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
- } else {
- // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function
- // can't handle them and will hit this assert.
- assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() &&
- "Vector of IfTrue & IfFalse is too small.");
-
- unsigned ExEltNum =
- EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits();
- EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum);
- ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC,
- DAG.getConstant(0, MVT::i64, false));
- ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
- }
- SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
- ResCC, IfTrue, IfFalse);
- return VSelect;
+ ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
+ ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
+ if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
+ Result.getValueType() == MVT::f64) {
+ bool Lossy;
+ APFloat CmpVal = CCmp->getValueAPF();
+ CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
+ return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
}
- // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are
- // vectors.
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
- CondCode = FPCCToA64CC(CC, Alternative);
- SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
- SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
- EVT SEVT = MVT::i32;
- if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32)
- SEVT = MVT::i64;
- SDValue AllOne = DAG.getConstant(-1, SEVT);
- SDValue AllZero = DAG.getConstant(0, SEVT);
- SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC,
- AllOne, AllZero, A64cc);
+ return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
+}
- if (Alternative != A64CC::Invalid) {
- A64cc = DAG.getConstant(Alternative, MVT::i32);
- A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- SetCC, AllOne, A64SELECT_CC, A64cc);
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CC = Op->getOperand(0);
+ SDValue TVal = Op->getOperand(1);
+ SDValue FVal = Op->getOperand(2);
+ SDLoc DL(Op);
+
+ unsigned Opc = CC.getOpcode();
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+ // instruction.
+ if (CC.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
+ return SDValue();
+
+ AArch64CC::CondCode OFCC;
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
+ SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+ return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
+ CCVal, Overflow);
}
- SDValue VDup;
- if (IfTrue.getValueType().getVectorNumElements() == 1)
- VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, CondVT, A64SELECT_CC);
+
+ if (CC.getOpcode() == ISD::SETCC)
+ return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
+ cast<CondCodeSDNode>(CC.getOperand(2))->get());
else
- VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, CondVT, A64SELECT_CC);
- SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
- VDup, IfTrue, IfFalse);
- return VSelect;
+ return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
+ FVal, ISD::SETNE);
}
-// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
-SDValue
-AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+ SelectionDAG &DAG) const {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- SDValue IfTrue = Op.getOperand(2);
- SDValue IfFalse = Op.getOperand(3);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TVal = Op.getOperand(2);
+ SDValue FVal = Op.getOperand(3);
+ SDLoc dl(Op);
- if (IfTrue.getValueType().isVector())
- return LowerVectorSELECT_CC(Op, DAG);
-
+ // Handle f128 first, because it will result in a comparison of some RTLIB
+ // call result against zero.
if (LHS.getValueType() == MVT::f128) {
- // f128 comparisons are lowered to libcalls, but slot in nicely here
- // afterwards.
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
- if (RHS.getNode() == 0) {
+ if (!RHS.getNode()) {
RHS = DAG.getConstant(0, LHS.getValueType());
CC = ISD::SETNE;
}
}
+ // Handle integers first.
if (LHS.getValueType().isInteger()) {
- SDValue A64cc;
+ assert((LHS.getValueType() == RHS.getValueType()) &&
+ (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
- // Integers are handled in a separate function because the combinations of
- // immediates and tests can get hairy and we may want to fiddle things.
- SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+ unsigned Opcode = AArch64ISD::CSEL;
- return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp,
- IfTrue, IfFalse, A64cc);
+ // If both the TVal and the FVal are constants, see if we can swap them in
+ // order to for a CSINV or CSINC out of them.
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+ if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (TVal.getOpcode() == ISD::XOR) {
+ // If TVal is a NOT we want to swap TVal and FVal so that we can match
+ // with a CSINV rather than a CSEL.
+ ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
+
+ if (CVal && CVal->isAllOnesValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ } else if (TVal.getOpcode() == ISD::SUB) {
+ // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
+ // that we can match with a CSNEG rather than a CSEL.
+ ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
+
+ if (CVal && CVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ } else if (CTVal && CFVal) {
+ const int64_t TrueVal = CTVal->getSExtValue();
+ const int64_t FalseVal = CFVal->getSExtValue();
+ bool Swap = false;
+
+ // If both TVal and FVal are constants, see if FVal is the
+ // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
+ // instead of a CSEL in that case.
+ if (TrueVal == ~FalseVal) {
+ Opcode = AArch64ISD::CSINV;
+ } else if (TrueVal == -FalseVal) {
+ Opcode = AArch64ISD::CSNEG;
+ } else if (TVal.getValueType() == MVT::i32) {
+ // If our operands are only 32-bit wide, make sure we use 32-bit
+ // arithmetic for the check whether we can use CSINC. This ensures that
+ // the addition in the check will wrap around properly in case there is
+ // an overflow (which would not be the case if we do the check with
+ // 64-bit arithmetic).
+ const uint32_t TrueVal32 = CTVal->getZExtValue();
+ const uint32_t FalseVal32 = CFVal->getZExtValue();
+
+ if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
+ Opcode = AArch64ISD::CSINC;
+
+ if (TrueVal32 > FalseVal32) {
+ Swap = true;
+ }
+ }
+ // 64-bit check whether we can use CSINC.
+ } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
+ Opcode = AArch64ISD::CSINC;
+
+ if (TrueVal > FalseVal) {
+ Swap = true;
+ }
+ }
+
+ // Swap TVal and FVal if necessary.
+ if (Swap) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ if (Opcode != AArch64ISD::CSEL) {
+ // Drop FVal since we can get its value by simply inverting/negating
+ // TVal.
+ FVal = TVal;
+ }
+ }
+
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+ EVT VT = Op.getValueType();
+ return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
}
- // Note that some LLVM floating-point CondCodes can't be lowered to a single
- // conditional branch, hence FPCCToA64CC can set a second test, where either
- // passing is sufficient.
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
- CondCode = FPCCToA64CC(CC, Alternative);
- SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
- SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
- SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
- Op.getValueType(),
- SetCC, IfTrue, IfFalse, A64cc);
+ // Now we know we're dealing with FP values.
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+ assert(LHS.getValueType() == RHS.getValueType());
+ EVT VT = Op.getValueType();
- if (Alternative != A64CC::Invalid) {
- A64cc = DAG.getConstant(Alternative, MVT::i32);
- A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- SetCC, IfTrue, A64SELECT_CC, A64cc);
+ // Try to match this select into a max/min operation, which have dedicated
+ // opcode in the instruction set.
+ // FIXME: This is not correct in the presence of NaNs, so we only enable this
+ // in no-NaNs mode.
+ if (getTargetMachine().Options.NoNaNsFPMath) {
+ SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
+ if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
+ selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
+ CC = ISD::getSetCCSwappedOperands(CC);
+ std::swap(MinMaxLHS, MinMaxRHS);
+ }
+ if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
+ selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
+ switch (CC) {
+ default:
+ break;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
+ break;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
+ break;
+ }
+ }
}
- return A64SELECT_CC;
+ // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
+ // and do the comparison.
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+ // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+ // clean. Some of them require two CSELs to implement.
+ AArch64CC::CondCode CC1, CC2;
+ changeFPCCToAArch64CC(CC, CC1, CC2);
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+ SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+ // If we need a second CSEL, emit it, using the output of the first as the
+ // RHS. We're effectively OR'ing the two CC's together.
+ if (CC2 != AArch64CC::AL) {
+ SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+ return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ }
+
+ // Otherwise, return the output of the first CSEL.
+ return CS1;
}
-SDValue
-AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
- const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
- const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Jump table entries as PC relative offsets. No additional tweaking
+ // is necessary here. Just get the address of the jump table.
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
- // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
- // rather than just 8.
- return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
- Op.getOperand(1), Op.getOperand(2),
- DAG.getConstant(32, MVT::i32), 8, false, false,
- MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ !Subtarget->isTargetMachO()) {
+ const unsigned char MO_NC = AArch64II::MO_NC;
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+ AArch64II::MO_G0 | MO_NC));
+ }
+
+ SDValue Hi =
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
+ SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}
-SDValue
-AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ // Use the GOT for the large code model on iOS.
+ if (Subtarget->isTargetMachO()) {
+ SDValue GotAddr = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+ AArch64II::MO_GOT);
+ return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+ }
+
+ const unsigned char MO_NC = AArch64II::MO_NC;
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), AArch64II::MO_G3),
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), AArch64II::MO_G2 | MO_NC),
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), AArch64II::MO_G1 | MO_NC),
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), AArch64II::MO_G0 | MO_NC));
+ } else {
+ // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
+ // ELF, the only valid one on Darwin.
+ SDValue Hi =
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), AArch64II::MO_PAGE);
+ SDValue Lo = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ }
+}
+
+SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ !Subtarget->isTargetMachO()) {
+ const unsigned char MO_NC = AArch64II::MO_NC;
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+ } else {
+ SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
+ SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ }
+}
+
+SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ AArch64FunctionInfo *FuncInfo =
+ DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FR =
+ DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV), false, false, 0);
+}
+
+SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
+ SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
- AArch64MachineFunctionInfo *FuncInfo
- = MF.getInfo<AArch64MachineFunctionInfo>();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
@@ -3228,498 +3511,2894 @@
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
- SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
- getPointerTy());
+ SDValue Stack =
+ DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
- MachinePointerInfo(SV), false, false, 0));
+ MachinePointerInfo(SV), false, false, 8));
// void *__gr_top at offset 8
- int GPRSize = FuncInfo->getVariadicGPRSize();
+ int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
DAG.getConstant(8, getPointerTy()));
- GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
+ GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
DAG.getConstant(GPRSize, getPointerTy()));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
- MachinePointerInfo(SV, 8),
- false, false, 0));
+ MachinePointerInfo(SV, 8), false, false, 8));
}
// void *__vr_top at offset 16
- int FPRSize = FuncInfo->getVariadicFPRSize();
+ int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
DAG.getConstant(16, getPointerTy()));
- VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
+ VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
DAG.getConstant(FPRSize, getPointerTy()));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
- MachinePointerInfo(SV, 16),
- false, false, 0));
+ MachinePointerInfo(SV, 16), false, false, 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
DAG.getConstant(24, getPointerTy()));
MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
- GROffsAddr, MachinePointerInfo(SV, 24),
- false, false, 0));
+ GROffsAddr, MachinePointerInfo(SV, 24), false,
+ false, 4));
// int __vr_offs at offset 28
SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
DAG.getConstant(28, getPointerTy()));
MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
- VROffsAddr, MachinePointerInfo(SV, 28),
- false, false, 0));
+ VROffsAddr, MachinePointerInfo(SV, 28), false,
+ false, 4));
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
- MemOps.size());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
-SDValue
-AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Don't know how to custom lower this!");
- case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
- case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
- case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
- case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
- case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
- case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
- case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
- case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
- case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
- case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
+ : LowerAAPCS_VASTART(Op, DAG);
+}
- case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
- case ISD::SRL_PARTS:
- case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
+SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
+ SelectionDAG &DAG) const {
+ // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
+ // pointer.
+ unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+ const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
- case ISD::BRCOND: return LowerBRCOND(Op, DAG);
- case ISD::BR_CC: return LowerBR_CC(Op, DAG);
- case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
- case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
- case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
- case ISD::JumpTable: return LowerJumpTable(Op, DAG);
- case ISD::SELECT: return LowerSELECT(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::VACOPY: return LowerVACOPY(Op, DAG);
- case ISD::VASTART: return LowerVASTART(Op, DAG);
- case ISD::BUILD_VECTOR:
- return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
- case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
+ Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
+ 8, false, false, MachinePointerInfo(DestSV),
+ MachinePointerInfo(SrcSV));
+}
+
+SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() &&
+ "automatic va_arg instruction only works on Darwin");
+
+ const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ unsigned Align = Op.getConstantOperandVal(3);
+
+ SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
+ MachinePointerInfo(V), false, false, false, 0);
+ Chain = VAList.getValue(1);
+
+ if (Align > 8) {
+ assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+ VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(Align - 1, getPointerTy()));
+ VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
+ DAG.getConstant(-(int64_t)Align, getPointerTy()));
}
+ Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+ uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+
+ // Scalar integer and FP values smaller than 64 bits are implicitly extended
+ // up to 64 bits. At the very least, we have to increase the striding of the
+ // vaargs list to match this, and for FP values we need to introduce
+ // FP_ROUND nodes as well.
+ if (VT.isInteger() && !VT.isVector())
+ ArgSize = 8;
+ bool NeedFPTrunc = false;
+ if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
+ ArgSize = 8;
+ NeedFPTrunc = true;
+ }
+
+ // Increment the pointer, VAList, to the next vaarg
+ SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(ArgSize, getPointerTy()));
+ // Store the incremented VAList to the legalized pointer
+ SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
+ false, false, 0);
+
+ // Load the actual argument out of the pointer VAList
+ if (NeedFPTrunc) {
+ // Load the value as an f64.
+ SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
+ MachinePointerInfo(), false, false, false, 0);
+ // Round the value down to an f32.
+ SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
+ DAG.getIntPtrConstant(1));
+ SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+ // Merge the rounded value with the chain output of the load.
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
+ false, false, 0);
+}
+
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDValue FrameAddr =
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo(), false, false, false, 0);
+ return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
+ EVT VT) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("sp", AArch64::SP)
+ .Default(0);
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset = DAG.getConstant(8, getPointerTy());
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+ MachinePointerInfo(), false, false, false, 0);
+ }
+
+ // Return LR, which contains the return address. Mark it an implicit live-in.
+ unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue ARMcc;
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(VTBits, MVT::i64), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+ DAG.getConstant(VTBits, MVT::i64));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+ SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+ ISD::SETGE, dl, DAG);
+ SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+
+ SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ SDValue Lo =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+ // AArch64 shifts larger than the register width are wrapped rather than
+ // clamped, so we can't just emit "hi >> x".
+ SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue TrueValHi = Opc == ISD::SRA
+ ? DAG.getNode(Opc, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i64))
+ : DAG.getConstant(0, VT);
+ SDValue Hi =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue ARMcc;
+
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(VTBits, MVT::i64), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+ DAG.getConstant(VTBits, MVT::i64));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+
+ SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+ ISD::SETGE, dl, DAG);
+ SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+ SDValue Hi =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+
+ // AArch64 shifts of larger than register sizes are wrapped rather than
+ // clamped, so we can't just emit "lo << a" if a is too big.
+ SDValue TrueValLo = DAG.getConstant(0, VT);
+ SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Lo =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+bool AArch64TargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode *GA) const {
+ // The AArch64 target doesn't support folding offsets into global addresses.
+ return false;
+}
+
+bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
+ // FIXME: We should be able to handle f128 as well with a clever lowering.
+ if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
+ return true;
+
+ if (VT == MVT::f64)
+ return AArch64_AM::getFP64Imm(Imm) != -1;
+ else if (VT == MVT::f32)
+ return AArch64_AM::getFP32Imm(Imm) != -1;
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AArch64 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Table of Constraints
+// TODO: This is the current set of constraints supported by ARM for the
+// compiler, not all of them may make sense, e.g. S may be difficult to support.
+//
+// r - A general register
+// w - An FP/SIMD register of some size in the range v0-v31
+// x - An FP/SIMD register of some size in the range v0-v15
+// I - Constant that can be used with an ADD instruction
+// J - Constant that can be used with a SUB instruction
+// K - Constant that can be used with a 32-bit logical instruction
+// L - Constant that can be used with a 64-bit logical instruction
+// M - Constant that can be used as a 32-bit MOV immediate
+// N - Constant that can be used as a 64-bit MOV immediate
+// Q - A memory reference with base register and no offset
+// S - A symbolic address
+// Y - Floating point constant zero
+// Z - Integer constant zero
+//
+// Note that general register operands will be output using their 64-bit x
+// register name, whatever the size of the variable, unless the asm operand
+// is prefixed by the %w modifier. Floating-point and SIMD register operands
+// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
+// %q modifier.
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AArch64TargetLowering::ConstraintType
+AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'z':
+ return C_Other;
+ case 'x':
+ case 'w':
+ return C_RegisterClass;
+ // An address with a single base register. Due to the way we
+ // currently handle addresses it is the same as 'r'.
+ case 'Q':
+ return C_Memory;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+AArch64TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (!CallOperandVal)
+ return CW_Default;
+ Type *type = CallOperandVal->getType();
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+ weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ break;
+ case 'x':
+ case 'w':
+ if (type->isFloatingPointTy() || type->isVectorTy())
+ weight = CW_Register;
+ break;
+ case 'z':
+ weight = CW_Constant;
+ break;
+ }
+ return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+AArch64TargetLowering::getRegForInlineAsmConstraint(
+ const std::string &Constraint, MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ if (VT.getSizeInBits() == 64)
+ return std::make_pair(0U, &AArch64::GPR64commonRegClass);
+ return std::make_pair(0U, &AArch64::GPR32commonRegClass);
+ case 'w':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &AArch64::FPR32RegClass);
+ if (VT.getSizeInBits() == 64)
+ return std::make_pair(0U, &AArch64::FPR64RegClass);
+ if (VT.getSizeInBits() == 128)
+ return std::make_pair(0U, &AArch64::FPR128RegClass);
+ break;
+ // The instructions that this constraint is designed for can
+ // only take 128-bit registers so just use that regclass.
+ case 'x':
+ if (VT.getSizeInBits() == 128)
+ return std::make_pair(0U, &AArch64::FPR128_loRegClass);
+ break;
+ }
+ }
+ if (StringRef("{cc}").equals_lower(Constraint))
+ return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass *> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (!Res.second) {
+ unsigned Size = Constraint.size();
+ if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
+ const std::string Reg =
+ std::string(&Constraint[2], &Constraint[Size - 1]);
+ int RegNo = atoi(Reg.c_str());
+ if (RegNo >= 0 && RegNo <= 31) {
+ // v0 - v31 are aliases of q0 - q31.
+ // By default we'll emit v0-v31 for this unless there's a modifier where
+ // we'll emit the correct register as well.
+ Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR128RegClass;
+ }
+ }
+ }
+
+ return Res;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void AArch64TargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result;
+
+ // Currently only support length 1 constraints.
+ if (Constraint.length() != 1)
+ return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default:
+ break;
+
+ // This set of constraints deal with valid constants for various instructions.
+ // Validate and return a target constant for them if we can.
+ case 'z': {
+ // 'z' maps to xzr or wzr so it needs an input of 0.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C || C->getZExtValue() != 0)
+ return;
+
+ if (Op.getValueType() == MVT::i64)
+ Result = DAG.getRegister(AArch64::XZR, MVT::i64);
+ else
+ Result = DAG.getRegister(AArch64::WZR, MVT::i32);
+ break;
+ }
+
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C)
+ return;
+
+ // Grab the value and do some validation.
+ uint64_t CVal = C->getZExtValue();
+ switch (ConstraintLetter) {
+ // The I constraint applies only to simple ADD or SUB immediate operands:
+ // i.e. 0 to 4095 with optional shift by 12
+ // The J constraint applies only to ADD or SUB immediates that would be
+ // valid when negated, i.e. if [an add pattern] were to be output as a SUB
+ // instruction [or vice versa], in other words -1 to -4095 with optional
+ // left shift by 12.
+ case 'I':
+ if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
+ break;
+ return;
+ case 'J': {
+ uint64_t NVal = -C->getSExtValue();
+ if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+ break;
+ return;
+ }
+ // The K and L constraints apply *only* to logical immediates, including
+ // what used to be the MOVI alias for ORR (though the MOVI alias has now
+ // been removed and MOV should be used). So these constraints have to
+ // distinguish between bit patterns that are valid 32-bit or 64-bit
+ // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
+ // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
+ // versa.
+ case 'K':
+ if (AArch64_AM::isLogicalImmediate(CVal, 32))
+ break;
+ return;
+ case 'L':
+ if (AArch64_AM::isLogicalImmediate(CVal, 64))
+ break;
+ return;
+ // The M and N constraints are a superset of K and L respectively, for use
+ // with the MOV (immediate) alias. As well as the logical immediates they
+ // also match 32 or 64-bit immediates that can be loaded either using a
+ // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
+ // (M) or 64-bit 0x1234000000000000 (N) etc.
+ // As a note some of this code is liberally stolen from the asm parser.
+ case 'M': {
+ if (!isUInt<32>(CVal))
+ return;
+ if (AArch64_AM::isLogicalImmediate(CVal, 32))
+ break;
+ if ((CVal & 0xFFFF) == CVal)
+ break;
+ if ((CVal & 0xFFFF0000ULL) == CVal)
+ break;
+ uint64_t NCVal = ~(uint32_t)CVal;
+ if ((NCVal & 0xFFFFULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF0000ULL) == NCVal)
+ break;
+ return;
+ }
+ case 'N': {
+ if (AArch64_AM::isLogicalImmediate(CVal, 64))
+ break;
+ if ((CVal & 0xFFFFULL) == CVal)
+ break;
+ if ((CVal & 0xFFFF0000ULL) == CVal)
+ break;
+ if ((CVal & 0xFFFF00000000ULL) == CVal)
+ break;
+ if ((CVal & 0xFFFF000000000000ULL) == CVal)
+ break;
+ uint64_t NCVal = ~CVal;
+ if ((NCVal & 0xFFFFULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF0000ULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF00000000ULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
+ break;
+ return;
+ }
+ default:
+ return;
+ }
+
+ // All assembler immediates are 64-bit integers.
+ Result = DAG.getTargetConstant(CVal, MVT::i64);
+ break;
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Advanced SIMD Support
+//===----------------------------------------------------------------------===//
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
+ EVT VT = V64Reg.getValueType();
+ unsigned NarrowSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+ SDLoc DL(V64Reg);
+
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
+ V64Reg, DAG.getConstant(0, MVT::i32));
+}
+
+/// getExtFactor - Determine the adjustment factor for the position when
+/// generating an "extract from vector registers" instruction.
+static unsigned getExtFactor(SDValue &V) {
+ EVT EltType = V.getValueType().getVectorElementType();
+ return EltType.getSizeInBits() / 8;
+}
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+ EVT VT = V128Reg.getValueType();
+ unsigned WideSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+ SDLoc DL(V128Reg);
+
+ return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 2> SourceVecs;
+ SmallVector<unsigned, 2> MinElts;
+ SmallVector<unsigned, 2> MaxElts;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ // A shuffle can only come from building a vector from various
+ // elements of other vectors.
+ return SDValue();
+ }
+
+ // Record this extraction against the appropriate vector if possible...
+ SDValue SourceVec = V.getOperand(0);
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ bool FoundSource = false;
+ for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+ if (SourceVecs[j] == SourceVec) {
+ if (MinElts[j] > EltNo)
+ MinElts[j] = EltNo;
+ if (MaxElts[j] < EltNo)
+ MaxElts[j] = EltNo;
+ FoundSource = true;
+ break;
+ }
+ }
+
+ // Or record a new source if not...
+ if (!FoundSource) {
+ SourceVecs.push_back(SourceVec);
+ MinElts.push_back(EltNo);
+ MaxElts.push_back(EltNo);
+ }
+ }
+
+ // Currently only do something sane when at most two source vectors
+ // involved.
+ if (SourceVecs.size() > 2)
+ return SDValue();
+
+ SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+ int VEXTOffsets[2] = { 0, 0 };
+
+ // This loop extracts the usage patterns of the source vectors
+ // and prepares appropriate SDValues for a shuffle if possible.
+ for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+ if (SourceVecs[i].getValueType() == VT) {
+ // No VEXT necessary
+ ShuffleSrcs[i] = SourceVecs[i];
+ VEXTOffsets[i] = 0;
+ continue;
+ } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+ // We can pad out the smaller vector for free, so if it's part of a
+ // shuffle...
+ ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i],
+ DAG.getUNDEF(SourceVecs[i].getValueType()));
+ continue;
+ }
+
+ // Don't attempt to extract subvectors from BUILD_VECTOR sources
+ // that expand or trunc the original value.
+ // TODO: We can try to bitcast and ANY_EXTEND the result but
+ // we need to consider the cost of vector ANY_EXTEND, and the
+ // legality of all the types.
+ if (SourceVecs[i].getValueType().getVectorElementType() !=
+ VT.getVectorElementType())
+ return SDValue();
+
+ // Since only 64-bit and 128-bit vectors are legal on ARM and
+ // we've eliminated the other cases...
+ assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
+ "unexpected vector sizes in ReconstructShuffle");
+
+ if (MaxElts[i] - MinElts[i] >= NumElts) {
+ // Span too large for a VEXT to cope
+ return SDValue();
+ }
+
+ if (MinElts[i] >= NumElts) {
+ // The extraction can just take the second half
+ VEXTOffsets[i] = NumElts;
+ ShuffleSrcs[i] =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+ DAG.getIntPtrConstant(NumElts));
+ } else if (MaxElts[i] < NumElts) {
+ // The extraction can just take the first half
+ VEXTOffsets[i] = 0;
+ ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i], DAG.getIntPtrConstant(0));
+ } else {
+ // An actual VEXT is needed
+ VEXTOffsets[i] = MinElts[i];
+ SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i], DAG.getIntPtrConstant(0));
+ SDValue VEXTSrc2 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+ DAG.getIntPtrConstant(NumElts));
+ unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
+ ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+ }
+
+ SmallVector<int, 8> Mask;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Entry = Op.getOperand(i);
+ if (Entry.getOpcode() == ISD::UNDEF) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ SDValue ExtractVec = Entry.getOperand(0);
+ int ExtractElt =
+ cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
+ if (ExtractVec == SourceVecs[0]) {
+ Mask.push_back(ExtractElt - VEXTOffsets[0]);
+ } else {
+ Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+ }
+ }
+
+ // Final check before we try to produce nonsense...
+ if (isShuffleMaskLegal(Mask, VT))
+ return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+ &Mask[0]);
+
return SDValue();
}
-/// Check if the specified splat value corresponds to a valid vector constant
-/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If
-/// so, return the encoded 8-bit immediate and the OpCmode instruction fields
-/// values.
-static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
- unsigned SplatBitSize, SelectionDAG &DAG,
- bool is128Bits, NeonModImmType type, EVT &VT,
- unsigned &Imm, unsigned &OpCmode) {
- switch (SplatBitSize) {
- default:
- llvm_unreachable("unexpected size for isNeonModifiedImm");
- case 8: {
- if (type != Neon_Mov_Imm)
- return false;
- assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
- // Neon movi per byte: Op=0, Cmode=1110.
- OpCmode = 0xe;
- Imm = SplatBits;
- VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
- break;
- }
- case 16: {
- // Neon move inst per halfword
- VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
- if ((SplatBits & ~0xff) == 0) {
- // Value = 0x00nn is 0x00nn LSL 0
- // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
- // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001
- // Op=x, Cmode=100y
- Imm = SplatBits;
- OpCmode = 0x8;
- break;
- }
- if ((SplatBits & ~0xff00) == 0) {
- // Value = 0xnn00 is 0x00nn LSL 8
- // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
- // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011
- // Op=x, Cmode=101x
- Imm = SplatBits >> 8;
- OpCmode = 0xa;
- break;
- }
- // can't handle any other
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Assume that the first shuffle index is not UNDEF. Fail if it is.
+ if (M[0] < 0)
return false;
- }
- case 32: {
- // First the LSL variants (MSL is unusable by some interested instructions).
+ Imm = M[0];
- // Neon move instr per word, shift zeros
- VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
- if ((SplatBits & ~0xff) == 0) {
- // Value = 0x000000nn is 0x000000nn LSL 0
- // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
- // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001
- // Op=x, Cmode=000x
- Imm = SplatBits;
- OpCmode = 0;
- break;
- }
- if ((SplatBits & ~0xff00) == 0) {
- // Value = 0x0000nn00 is 0x000000nn LSL 8
- // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010
- // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011
- // Op=x, Cmode=001x
- Imm = SplatBits >> 8;
- OpCmode = 0x2;
- break;
- }
- if ((SplatBits & ~0xff0000) == 0) {
- // Value = 0x00nn0000 is 0x000000nn LSL 16
- // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
- // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101
- // Op=x, Cmode=010x
- Imm = SplatBits >> 16;
- OpCmode = 0x4;
- break;
- }
- if ((SplatBits & ~0xff000000) == 0) {
- // Value = 0xnn000000 is 0x000000nn LSL 24
- // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
- // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111
- // Op=x, Cmode=011x
- Imm = SplatBits >> 24;
- OpCmode = 0x6;
- break;
- }
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, just follow it
+ // back to index zero and keep going.
+ ++ExpectedElt;
+ if (ExpectedElt == NumElts)
+ ExpectedElt = 0;
- // Now the MSL immediates.
-
- // Neon move instr per word, shift ones
- if ((SplatBits & ~0xffff) == 0 &&
- ((SplatBits | SplatUndef) & 0xff) == 0xff) {
- // Value = 0x0000nnff is 0x000000nn MSL 8
- // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
- // Op=x, Cmode=1100
- Imm = SplatBits >> 8;
- OpCmode = 0xc;
- break;
- }
- if ((SplatBits & ~0xffffff) == 0 &&
- ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
- // Value = 0x00nnffff is 0x000000nn MSL 16
- // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101
- // Op=x, Cmode=1101
- Imm = SplatBits >> 16;
- OpCmode = 0xd;
- break;
- }
- // can't handle any other
- return false;
- }
-
- case 64: {
- if (type != Neon_Mov_Imm)
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
return false;
- // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
- // movi Op=1, Cmode=1110.
- OpCmode = 0x1e;
- uint64_t BitMask = 0xff;
- uint64_t Val = 0;
- unsigned ImmMask = 1;
- Imm = 0;
- for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
- if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
- Val |= BitMask;
- Imm |= ImmMask;
- } else if ((SplatBits & BitMask) != 0) {
- return false;
- }
- BitMask <<= 8;
- ImmMask <<= 1;
- }
- SplatBits = Val;
- VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
- break;
- }
}
return true;
}
-static SDValue PerformANDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are different.
+static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+ unsigned &Imm) {
+ // Look for the first non-undef element.
+ const int *FirstRealElt = std::find_if(M.begin(), M.end(),
+ [](int Elt) {return Elt >= 0;});
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
+ // Benefit form APInt to handle overflow when calculating expected element.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+ APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+ // The following shuffle indices must be the successive elements after the
+ // first real element.
+ const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+ [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+ if (FirstWrongElt != M.end())
+ return false;
- // We're looking for an SRA/SHL pair which form an SBFX.
+ // The index of an EXT is the first element if it is not UNDEF.
+ // Watch out for the beginning UNDEFs. The EXT index should be the expected
+ // value of the first element. E.g.
+ // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+ // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+ // ExpectedElt is the last mask index plus 1.
+ Imm = ExpectedElt.getZExtValue();
- if (VT != MVT::i32 && VT != MVT::i64)
- return SDValue();
+ // There are two difference cases requiring to reverse input vectors.
+ // For example, for vector <4 x i32> we have the following cases,
+ // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+ // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+ // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+ // to reverse two input vectors.
+ if (Imm < NumElts)
+ ReverseEXT = true;
+ else
+ Imm -= NumElts;
- if (!isa<ConstantSDNode>(N->getOperand(1)))
- return SDValue();
-
- uint64_t TruncMask = N->getConstantOperandVal(1);
- if (!isMask_64(TruncMask))
- return SDValue();
-
- uint64_t Width = CountPopulation_64(TruncMask);
- SDValue Shift = N->getOperand(0);
-
- if (Shift.getOpcode() != ISD::SRL)
- return SDValue();
-
- if (!isa<ConstantSDNode>(Shift->getOperand(1)))
- return SDValue();
- uint64_t LSB = Shift->getConstantOperandVal(1);
-
- if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
- return SDValue();
-
- return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
- DAG.getConstant(LSB, MVT::i64),
- DAG.getConstant(LSB + Width - 1, MVT::i64));
+ return true;
}
-/// For a true bitfield insert, the bits getting into that contiguous mask
-/// should come from the low part of an existing value: they must be formed from
-/// a compatible SHL operation (unless they're already low). This function
-/// checks that condition and returns the least-significant bit that's
-/// intended. If the operation not a field preparation, -1 is returned.
-static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
- SDValue &MaskedVal, uint64_t Mask) {
- if (!isShiftedMask_64(Mask))
- return -1;
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
- // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
- // instruction. BFI will do a left-shift by LSB before applying the mask we've
- // spotted, so in general we should pre-emptively "undo" that by making sure
- // the incoming bits have had a right-shift applied to them.
- //
- // This right shift, however, will combine with existing left/right shifts. In
- // the simplest case of a completely straight bitfield operation, it will be
- // expected to completely cancel out with an existing SHL. More complicated
- // cases (e.g. bitfield to bitfield copy) may still need a real shift before
- // the BFI.
+ unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ if (EltSz == 64)
+ return false;
- uint64_t LSB = countTrailingZeros(Mask);
- int64_t ShiftRightRequired = LSB;
- if (MaskedVal.getOpcode() == ISD::SHL &&
- isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
- ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
- MaskedVal = MaskedVal.getOperand(0);
- } else if (MaskedVal.getOpcode() == ISD::SRL &&
- isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
- ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
- MaskedVal = MaskedVal.getOperand(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BlockElts = M[0] + 1;
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSz;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
}
- if (ShiftRightRequired > 0)
- MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
- DAG.getConstant(ShiftRightRequired, MVT::i64));
- else if (ShiftRightRequired < 0) {
- // We could actually end up with a residual left shift, for example with
- // "struc.bitfield = val << 1".
- MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
- DAG.getConstant(-ShiftRightRequired, MVT::i64));
- }
-
- return LSB;
+ return true;
}
-/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
-/// a mask and an extension. Returns true if a BFI was found and provides
-/// information on its surroundings.
-static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
- bool &Extended) {
- Extended = false;
- if (N.getOpcode() == ISD::ZERO_EXTEND) {
- Extended = true;
- N = N.getOperand(0);
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+ return false;
+ Idx += 1;
}
- if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
- Mask = N->getConstantOperandVal(1);
- N = N.getOperand(0);
- } else {
- // Mask is the whole width.
- Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
+ return true;
+}
+
+static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != 2 * i + WhichResult)
+ return false;
}
- if (N.getOpcode() == AArch64ISD::BFI) {
- BFI = N;
+ return true;
+}
+
+static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
+ return false;
+ Idx += 1;
+ }
+
+ return true;
+}
+
+/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned Half = VT.getVectorNumElements() / 2;
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned j = 0; j != 2; ++j) {
+ unsigned Idx = WhichResult;
+ for (unsigned i = 0; i != Half; ++i) {
+ int MIdx = M[i + j * Half];
+ if (MIdx >= 0 && (unsigned)MIdx != Idx)
+ return false;
+ Idx += 2;
+ }
+ }
+
+ return true;
+}
+
+/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+static bool isINSMask(ArrayRef<int> M, int NumInputElements,
+ bool &DstIsLeft, int &Anomaly) {
+ if (M.size() != static_cast<size_t>(NumInputElements))
+ return false;
+
+ int NumLHSMatch = 0, NumRHSMatch = 0;
+ int LastLHSMismatch = -1, LastRHSMismatch = -1;
+
+ for (int i = 0; i < NumInputElements; ++i) {
+ if (M[i] == -1) {
+ ++NumLHSMatch;
+ ++NumRHSMatch;
+ continue;
+ }
+
+ if (M[i] == i)
+ ++NumLHSMatch;
+ else
+ LastLHSMismatch = i;
+
+ if (M[i] == i + NumInputElements)
+ ++NumRHSMatch;
+ else
+ LastRHSMismatch = i;
+ }
+
+ if (NumLHSMatch == NumInputElements - 1) {
+ DstIsLeft = true;
+ Anomaly = LastLHSMismatch;
+ return true;
+ } else if (NumRHSMatch == NumInputElements - 1) {
+ DstIsLeft = false;
+ Anomaly = LastRHSMismatch;
return true;
}
return false;
}
-/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
-/// is roughly equivalent to (and (BFI ...), mask). This form is used because it
-/// can often be further combined with a larger mask. Ultimately, we want mask
-/// to be 2^32-1 or 2^64-1 so the AND can be skipped.
-static SDValue tryCombineToBFI(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
+static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
+ if (VT.getSizeInBits() != 128)
+ return false;
- assert(N->getOpcode() == ISD::OR && "Unexpected root");
+ unsigned NumElts = VT.getVectorNumElements();
- // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
- // abandon the effort.
- SDValue LHS = N->getOperand(0);
- if (LHS.getOpcode() != ISD::AND)
- return SDValue();
-
- uint64_t LHSMask;
- if (isa<ConstantSDNode>(LHS.getOperand(1)))
- LHSMask = LHS->getConstantOperandVal(1);
- else
- return SDValue();
-
- // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
- // is or abandon the effort.
- SDValue RHS = N->getOperand(1);
- if (RHS.getOpcode() != ISD::AND)
- return SDValue();
-
- uint64_t RHSMask;
- if (isa<ConstantSDNode>(RHS.getOperand(1)))
- RHSMask = RHS->getConstantOperandVal(1);
- else
- return SDValue();
-
- // Can't do anything if the masks are incompatible.
- if (LHSMask & RHSMask)
- return SDValue();
-
- // Now we need one of the masks to be a contiguous field. Without loss of
- // generality that should be the RHS one.
- SDValue Bitfield = LHS.getOperand(0);
- if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
- // We know that LHS is a candidate new value, and RHS isn't already a better
- // one.
- std::swap(LHS, RHS);
- std::swap(LHSMask, RHSMask);
+ for (int I = 0, E = NumElts / 2; I != E; I++) {
+ if (Mask[I] != I)
+ return false;
}
- // We've done our best to put the right operands in the right places, all we
- // can do now is check whether a BFI exists.
- Bitfield = RHS.getOperand(0);
- int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
- if (LSB == -1)
- return SDValue();
+ int Offset = NumElts / 2;
+ for (int I = NumElts / 2, E = NumElts; I != E; I++) {
+ if (Mask[I] != I + SplitLHS * Offset)
+ return false;
+ }
- uint32_t Width = CountPopulation_64(RHSMask);
- assert(Width && "Expected non-zero bitfield width");
-
- SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
- LHS.getOperand(0), Bitfield,
- DAG.getConstant(LSB, MVT::i64),
- DAG.getConstant(Width, MVT::i64));
-
- // Mask is trivial
- if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
- return BFI;
-
- return DAG.getNode(ISD::AND, DL, VT, BFI,
- DAG.getConstant(LHSMask | RHSMask, VT));
+ return true;
}
-/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
-/// original input. This is surprisingly common because SROA splits things up
-/// into i8 chunks, so the originally detected MaskedBFI may actually only act
-/// on the low (say) byte of a word. This is then orred into the rest of the
-/// word afterwards.
-///
-/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
-///
-/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
-/// MaskedBFI. We can also deal with a certain amount of extend/truncate being
-/// involved.
-static SDValue tryCombineToLargerBFI(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
+static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue V0 = Op.getOperand(0);
+ SDValue V1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
- // First job is to hunt for a MaskedBFI on either the left or right. Swap
- // operands if it's actually on the right.
- SDValue BFI;
- SDValue PossExtraMask;
- uint64_t ExistingMask = 0;
- bool Extended = false;
- if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
- PossExtraMask = N->getOperand(1);
- else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
- PossExtraMask = N->getOperand(0);
- else
+ if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
+ VT.getVectorElementType() != V1.getValueType().getVectorElementType())
return SDValue();
- // We can only combine a BFI with another compatible mask.
- if (PossExtraMask.getOpcode() != ISD::AND ||
- !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
+ bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
+
+ if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
- uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
+ EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ if (SplitV0) {
+ V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
+ DAG.getConstant(0, MVT::i64));
+ }
+ if (V1.getValueType().getSizeInBits() == 128) {
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
+ DAG.getConstant(0, MVT::i64));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
+}
- // Masks must be compatible.
- if (ExtraMask & ExistingMask)
- return SDValue();
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ SDLoc dl) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
- SDValue OldBFIVal = BFI.getOperand(0);
- SDValue NewBFIVal = BFI.getOperand(1);
- if (Extended) {
- // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
- // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
- // need to be made compatible.
- assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
- && "Invalid types for BFI");
- OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
- NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
+ enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+ };
+
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1 * 9 + 2) * 9 + 3)
+ return LHS;
+ assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+ return RHS;
}
- // We need the MaskedBFI to be combined with a mask of the *same* value.
- if (PossExtraMask.getOperand(0) != OldBFIVal)
+ SDValue OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+ OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+ EVT VT = OpLHS.getValueType();
+
+ switch (OpNum) {
+ default:
+ llvm_unreachable("Unknown shuffle opcode!");
+ case OP_VREV:
+ // VREV divides the vector in half and swaps within the half.
+ if (VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::f32)
+ return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
+ // vrev <4 x i16> -> REV32
+ if (VT.getVectorElementType() == MVT::i16)
+ return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
+ // vrev <4 x i8> -> REV16
+ assert(VT.getVectorElementType() == MVT::i8);
+ return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3: {
+ EVT EltTy = VT.getVectorElementType();
+ unsigned Opcode;
+ if (EltTy == MVT::i8)
+ Opcode = AArch64ISD::DUPLANE8;
+ else if (EltTy == MVT::i16)
+ Opcode = AArch64ISD::DUPLANE16;
+ else if (EltTy == MVT::i32 || EltTy == MVT::f32)
+ Opcode = AArch64ISD::DUPLANE32;
+ else if (EltTy == MVT::i64 || EltTy == MVT::f64)
+ Opcode = AArch64ISD::DUPLANE64;
+ else
+ llvm_unreachable("Invalid vector element type?");
+
+ if (VT.getSizeInBits() == 64)
+ OpLHS = WidenVector(OpLHS, DAG);
+ SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
+ return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
+ }
+ case OP_VEXT1:
+ case OP_VEXT2:
+ case OP_VEXT3: {
+ unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
+ return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+ case OP_VUZPL:
+ return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
+ OpRHS);
+ case OP_VUZPR:
+ return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
+ OpRHS);
+ case OP_VZIPL:
+ return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
+ OpRHS);
+ case OP_VZIPR:
+ return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
+ OpRHS);
+ case OP_VTRNL:
+ return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
+ OpRHS);
+ case OP_VTRNR:
+ return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
+ OpRHS);
+ }
+}
+
+static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
+ SelectionDAG &DAG) {
+ // Check to see if we can use the TBL instruction.
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc DL(Op);
+
+ EVT EltVT = Op.getValueType().getVectorElementType();
+ unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
+
+ SmallVector<SDValue, 8> TBLMask;
+ for (int Val : ShuffleMask) {
+ for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+ unsigned Offset = Byte + Val * BytesPerElt;
+ TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
+ }
+ }
+
+ MVT IndexVT = MVT::v8i8;
+ unsigned IndexLen = 8;
+ if (Op.getValueType().getSizeInBits() == 128) {
+ IndexVT = MVT::v16i8;
+ IndexLen = 16;
+ }
+
+ SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
+ SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
+
+ SDValue Shuffle;
+ if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+ if (IndexLen == 8)
+ V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
+ Shuffle = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ makeArrayRef(TBLMask.data(), IndexLen)));
+ } else {
+ if (IndexLen == 8) {
+ V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
+ Shuffle = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ makeArrayRef(TBLMask.data(), IndexLen)));
+ } else {
+ // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
+ // cannot currently represent the register constraints on the input
+ // table registers.
+ // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
+ // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ // &TBLMask[0], IndexLen));
+ Shuffle = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ makeArrayRef(TBLMask.data(), IndexLen)));
+ }
+ }
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
+static unsigned getDUPLANEOp(EVT EltType) {
+ if (EltType == MVT::i8)
+ return AArch64ISD::DUPLANE8;
+ if (EltType == MVT::i16)
+ return AArch64ISD::DUPLANE16;
+ if (EltType == MVT::i32 || EltType == MVT::f32)
+ return AArch64ISD::DUPLANE32;
+ if (EltType == MVT::i64 || EltType == MVT::f64)
+ return AArch64ISD::DUPLANE64;
+
+ llvm_unreachable("Invalid vector element type?");
+}
+
+SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+ // Convert shuffles that are directly supported on NEON to target-specific
+ // DAG nodes, instead of keeping them as shuffles and matching them again
+ // during code selection. This is more efficient and avoids the possibility
+ // of inconsistencies between legalization and selection.
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+
+ if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
+ V1.getValueType().getSimpleVT())) {
+ int Lane = SVN->getSplatIndex();
+ // If this is undef splat, generate it via "just" vdup, if possible.
+ if (Lane == -1)
+ Lane = 0;
+
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+ V1.getOperand(0));
+ // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+ // constant. If so, we can just reference the lane's definition directly.
+ if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+ !isa<ConstantSDNode>(V1.getOperand(Lane)))
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+ // Otherwise, duplicate from the lane of the input vector.
+ unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+ // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
+ // to make a vector of the same size as this SHUFFLE. We can ignore the
+ // extract entirely, and canonicalise the concat using WidenVector.
+ if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+ V1 = V1.getOperand(0);
+ } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+ unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+ Lane -= Idx * VT.getVectorNumElements() / 2;
+ V1 = WidenVector(V1.getOperand(Idx), DAG);
+ } else if (VT.getSizeInBits() == 64)
+ V1 = WidenVector(V1, DAG);
+
+ return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
+ }
+
+ if (isREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+ if (isREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+ if (isREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+ bool ReverseEXT = false;
+ unsigned Imm;
+ if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+ if (ReverseEXT)
+ std::swap(V1, V2);
+ Imm *= getExtFactor(V1);
+ return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+ DAG.getConstant(Imm, MVT::i32));
+ } else if (V2->getOpcode() == ISD::UNDEF &&
+ isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+ Imm *= getExtFactor(V1);
+ return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+
+ unsigned WhichResult;
+ if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+ if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+ if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+
+ if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+ if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+ if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+
+ SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
+ if (Concat.getNode())
+ return Concat;
+
+ bool DstIsLeft;
+ int Anomaly;
+ int NumInputElements = V1.getValueType().getVectorNumElements();
+ if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
+ SDValue DstVec = DstIsLeft ? V1 : V2;
+ SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
+
+ SDValue SrcVec = V1;
+ int SrcLane = ShuffleMask[Anomaly];
+ if (SrcLane >= NumInputElements) {
+ SrcVec = V2;
+ SrcLane -= VT.getVectorNumElements();
+ }
+ SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
+
+ EVT ScalarVT = VT.getVectorElementType();
+ if (ScalarVT.getSizeInBits() < 32)
+ ScalarVT = MVT::i32;
+
+ return DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
+ DstLaneV);
+ }
+
+ // If the shuffle is not directly supported and it has 4 elements, use
+ // the PerfectShuffle-generated table to synthesize it from other shuffles.
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 4) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ return GenerateTBL(Op, ShuffleMask, DAG);
+}
+
+static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
+ APInt &UndefBits) {
+ EVT VT = BVN->getValueType(0);
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
+
+ for (unsigned i = 0; i < NumSplats; ++i) {
+ CnstBits <<= SplatBitSize;
+ UndefBits <<= SplatBitSize;
+ CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
+ UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+ SelectionDAG &DAG) const {
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ SDValue LHS = Op.getOperand(0);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ if (!BVN)
+ return Op;
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We only have BIC vector immediate instruction, which is and-not.
+ CnstBits = ~CnstBits;
+
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+ }
+
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = ~UndefBits;
+ goto AttemptModImm;
+ }
+
+// We can always fall back to a non-immediate AND.
+FailedModImm:
+ return Op;
+}
+
+// Specialized code to quickly find if PotentialBVec is a BuildVector that
+// consists of only the same constant int value, returned in reference arg
+// ConstVal
+static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
+ uint64_t &ConstVal) {
+ BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
+ if (!Bvec)
+ return false;
+ ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
+ if (!FirstElt)
+ return false;
+ EVT VT = Bvec->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ for (unsigned i = 1; i < NumElts; ++i)
+ if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
+ return false;
+ ConstVal = FirstElt->getZExtValue();
+ return true;
+}
+
+static unsigned getIntrinsicID(const SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ switch (Opcode) {
+ default:
+ return Intrinsic::not_intrinsic;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ if (IID < Intrinsic::num_intrinsics)
+ return IID;
+ return Intrinsic::not_intrinsic;
+ }
+ }
+}
+
+// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
+// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
+// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
+// Also, logical shift right -> sri, with the same structure.
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ if (!VT.isVector())
return SDValue();
- BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
- OldBFIVal, NewBFIVal,
- BFI.getOperand(2), BFI.getOperand(3));
+ SDLoc DL(N);
- // If the masking is trivial, we don't need to create it.
- if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
- return BFI;
+ // Is the first op an AND?
+ const SDValue And = N->getOperand(0);
+ if (And.getOpcode() != ISD::AND)
+ return SDValue();
- return DAG.getNode(ISD::AND, DL, VT, BFI,
- DAG.getConstant(ExtraMask | ExistingMask, VT));
+ // Is the second op an shl or lshr?
+ SDValue Shift = N->getOperand(1);
+ // This will have been turned into: AArch64ISD::VSHL vector, #shift
+ // or AArch64ISD::VLSHR vector, #shift
+ unsigned ShiftOpc = Shift.getOpcode();
+ if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
+ return SDValue();
+ bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+
+ // Is the shift amount constant?
+ ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!C2node)
+ return SDValue();
+
+ // Is the and mask vector all constant?
+ uint64_t C1;
+ if (!isAllConstantBuildVector(And.getOperand(1), C1))
+ return SDValue();
+
+ // Is C1 == ~C2, taking into account how much one can shift elements of a
+ // particular size?
+ uint64_t C2 = C2node->getZExtValue();
+ unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
+ if (C2 > ElemSizeInBits)
+ return SDValue();
+ unsigned ElemMask = (1 << ElemSizeInBits) - 1;
+ if ((C1 & ElemMask) != (~C2 & ElemMask))
+ return SDValue();
+
+ SDValue X = And.getOperand(0);
+ SDValue Y = Shift.getOperand(0);
+
+ unsigned Intrin =
+ IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
+ SDValue ResultSLI =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
+
+ DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+ DEBUG(N->dump(&DAG));
+ DEBUG(dbgs() << "into: \n");
+ DEBUG(ResultSLI->dump(&DAG));
+
+ ++NumShiftInserts;
+ return ResultSLI;
+}
+
+SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+ if (EnableAArch64SlrGeneration) {
+ SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
+ if (Res.getNode())
+ return Res;
+ }
+
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+ SDValue LHS = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ // OR commutes, so try swapping the operands.
+ if (!BVN) {
+ LHS = Op.getOperand(0);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ }
+ if (!BVN)
+ return Op;
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+ }
+
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = UndefBits;
+ goto AttemptModImm;
+ }
+
+// We can always fall back to a non-immediate OR.
+FailedModImm:
+ return Op;
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ // Certain magic vector constants (used to express things like NOT
+ // and NEG) are passed through unmodified. This allows codegen patterns
+ // for these operations to match. Special-purpose patterns will lower
+ // these immediates to MOVIs if it proves necessary.
+ if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
+ return Op;
+
+ // The many faces of MOVI...
+ if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
+ if (VT.getSizeInBits() == 128) {
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ // Support the V64 version via subregister insertion.
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(264, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(272, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ // The few faces of FMOV...
+ if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
+ SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
+ VT.getSizeInBits() == 128) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
+ SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ // The many faces of MVNI...
+ CnstVal = ~CnstVal;
+ if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(264, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+ CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(272, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+ }
+
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = UndefBits;
+ goto AttemptModImm;
+ }
+FailedModImm:
+
+ // Scan through the operands to find some interesting properties we can
+ // exploit:
+ // 1) If only one value is used, we can use a DUP, or
+ // 2) if only the low element is not undef, we can just insert that, or
+ // 3) if only one constant value is used (w/ some non-constant lanes),
+ // we can splat the constant value into the whole vector then fill
+ // in the non-constant lanes.
+ // 4) FIXME: If different constant values are used, but we can intelligently
+ // select the values we'll be overwriting for the non-constant
+ // lanes such that we can directly materialize the vector
+ // some other way (MOVI, e.g.), we can be sneaky.
+ unsigned NumElts = VT.getVectorNumElements();
+ bool isOnlyLowElement = true;
+ bool usesOnlyOneValue = true;
+ bool usesOnlyOneConstantValue = true;
+ bool isConstant = true;
+ unsigned NumConstantLanes = 0;
+ SDValue Value;
+ SDValue ConstantValue;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
+ ++NumConstantLanes;
+ if (!ConstantValue.getNode())
+ ConstantValue = V;
+ else if (ConstantValue != V)
+ usesOnlyOneConstantValue = false;
+ }
+
+ if (!Value.getNode())
+ Value = V;
+ else if (V != Value)
+ usesOnlyOneValue = false;
+ }
+
+ if (!Value.getNode())
+ return DAG.getUNDEF(VT);
+
+ if (isOnlyLowElement)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+ // Use DUP for non-constant splats. For f32 constant splats, reduce to
+ // i32 and try again.
+ if (usesOnlyOneValue) {
+ if (!isConstant) {
+ if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Value.getValueType() != VT)
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
+
+ // This is actually a DUPLANExx operation, which keeps everything vectory.
+
+ // DUPLANE works on 128-bit vectors, widen it if necessary.
+ SDValue Lane = Value.getOperand(1);
+ Value = Value.getOperand(0);
+ if (Value.getValueType().getSizeInBits() == 64)
+ Value = WidenVector(Value, DAG);
+
+ unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+ return DAG.getNode(Opcode, dl, VT, Value, Lane);
+ }
+
+ if (VT.getVectorElementType().isFloatingPoint()) {
+ SmallVector<SDValue, 8> Ops;
+ MVT NewType =
+ (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
+ SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ Val = LowerBUILD_VECTOR(Val, DAG);
+ if (Val.getNode())
+ return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+ }
+ }
+
+ // If there was only one constant value used and for more than one lane,
+ // start by splatting that value, then replace the non-constant lanes. This
+ // is better than the default, which will perform a separate initialization
+ // for each lane.
+ if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+ SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+ // Now insert the non-constant lanes.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+ if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+ // Note that type legalization likely mucked about with the VT of the
+ // source operand, so we may have to convert it here before inserting.
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
+ }
+ }
+ return Val;
+ }
+
+ // If all elements are constants and the case above didn't get hit, fall back
+ // to the default expansion, which will generate a load from the constant
+ // pool.
+ if (isConstant)
+ return SDValue();
+
+ // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+ if (NumElts >= 4) {
+ SDValue shuffle = ReconstructShuffle(Op, DAG);
+ if (shuffle != SDValue())
+ return shuffle;
+ }
+
+ // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+ // know the default expansion would otherwise fall back on something even
+ // worse. For a vector with one or two non-undef values, that's
+ // scalar_to_vector for the elements followed by a shuffle (provided the
+ // shuffle is valid for the target) and materialization element by element
+ // on the stack followed by a load for everything else.
+ if (!isConstant && !usesOnlyOneValue) {
+ SDValue Vec = DAG.getUNDEF(VT);
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
+ unsigned i = 0;
+ // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+ // a) Avoid a RMW dependency on the full vector register, and
+ // b) Allow the register coalescer to fold away the copy if the
+ // value is already in an S or D register.
+ if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
+ unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
+ MachineSDNode *N =
+ DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
+ DAG.getTargetConstant(SubIdx, MVT::i32));
+ Vec = SDValue(N, 0);
+ ++i;
+ }
+ for (; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+ }
+ return Vec;
+ }
+
+ // Just use the default expansion. We failed to find a better alternative.
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+
+ // Check for non-constant lane.
+ if (!isa<ConstantSDNode>(Op.getOperand(2)))
+ return SDValue();
+
+ EVT VT = Op.getOperand(0).getValueType();
+
+ // Insertion/extraction are legal for V128 types.
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ return Op;
+
+ if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+ VT != MVT::v1i64 && VT != MVT::v2f32)
+ return SDValue();
+
+ // For V64 types, we perform insertion by expanding the value
+ // to a V128 type and perform the insertion on that.
+ SDLoc DL(Op);
+ SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+ EVT WideTy = WideVec.getValueType();
+
+ SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
+ Op.getOperand(1), Op.getOperand(2));
+ // Re-narrow the resultant vector.
+ return NarrowVector(Node, DAG);
+}
+
+SDValue
+AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+
+ // Check for non-constant lane.
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+
+ EVT VT = Op.getOperand(0).getValueType();
+
+ // Insertion/extraction are legal for V128 types.
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ return Op;
+
+ if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+ VT != MVT::v1i64 && VT != MVT::v2f32)
+ return SDValue();
+
+ // For V64 types, we perform extraction by expanding the value
+ // to a V128 type and perform the extraction on that.
+ SDLoc DL(Op);
+ SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+ EVT WideTy = WideVec.getValueType();
+
+ EVT ExtrTy = WideTy.getVectorElementType();
+ if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
+ ExtrTy = MVT::i32;
+
+ // For extractions, we just return the result directly.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
+ Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+ // Just in case...
+ if (!VT.isVector())
+ return SDValue();
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Cst)
+ return SDValue();
+ unsigned Val = Cst->getZExtValue();
+
+ unsigned Size = Op.getValueType().getSizeInBits();
+ if (Val == 0) {
+ switch (Size) {
+ case 8:
+ return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ case 16:
+ return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ case 32:
+ return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ case 64:
+ return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ default:
+ llvm_unreachable("Unexpected vector type in extract_subvector!");
+ }
+ }
+ // If this is extracting the upper 64-bits of a 128-bit vector, we match
+ // that directly.
+ if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
+ return Op;
+
+ return SDValue();
+}
+
+bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ if (VT.getVectorNumElements() == 4 &&
+ (VT.is128BitVector() || VT.is64BitVector())) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (M[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = M[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return true;
+ }
+
+ bool DummyBool;
+ int DummyInt;
+ unsigned DummyUnsigned;
+
+ return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
+ isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
+ isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
+ // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
+ isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
+ isZIPMask(M, VT, DummyUnsigned) ||
+ isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
+ isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
+ isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
+ isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
+ isConcatMask(M, VT, VT.getSizeInBits() == 128));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits for a left shift; or
+/// 0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+/// 1 <= |Value| <= ElementBits for a right shift; or
+/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+ int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ if (isIntrinsic)
+ Cnt = -Cnt;
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+}
+
+SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ int64_t Cnt;
+
+ if (!Op.getOperand(1).getValueType().isVector())
+ return Op;
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected shift opcode");
+
+ case ISD::SHL:
+ if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
+ return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
+ DAG.getConstant(Cnt, MVT::i32));
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
+ Op.getOperand(0), Op.getOperand(1));
+ case ISD::SRA:
+ case ISD::SRL:
+ // Right shift immediate
+ if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
+ Cnt < EltSize) {
+ unsigned Opc =
+ (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
+ return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
+ DAG.getConstant(Cnt, MVT::i32));
+ }
+
+ // Right shift register. Note, there is not a shift right register
+ // instruction, but the shift left register instruction takes a signed
+ // value, where negative numbers specify a right shift.
+ unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
+ : Intrinsic::aarch64_neon_ushl;
+ // negate the shift amount
+ SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
+ SDValue NegShiftLeft =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
+ return NegShiftLeft;
+ }
+
+ return SDValue();
+}
+
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+ AArch64CC::CondCode CC, bool NoNans, EVT VT,
+ SDLoc dl, SelectionDAG &DAG) {
+ EVT SrcVT = LHS.getValueType();
+
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+ bool IsZero = IsCnst && (CnstBits == 0);
+
+ if (SrcVT.getVectorElementType().isFloatingPoint()) {
+ switch (CC) {
+ default:
+ return SDValue();
+ case AArch64CC::NE: {
+ SDValue Fcmeq;
+ if (IsZero)
+ Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+ else
+ Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+ return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+ }
+ case AArch64CC::EQ:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+ case AArch64CC::GE:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
+ case AArch64CC::GT:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+ case AArch64CC::LS:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
+ case AArch64CC::LT:
+ if (!NoNans)
+ return SDValue();
+ // If we ignore NaNs then we can use to the MI implementation.
+ // Fallthrough.
+ case AArch64CC::MI:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
+ }
+ }
+
+ switch (CC) {
+ default:
+ return SDValue();
+ case AArch64CC::NE: {
+ SDValue Cmeq;
+ if (IsZero)
+ Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+ else
+ Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+ return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
+ }
+ case AArch64CC::EQ:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+ case AArch64CC::GE:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
+ case AArch64CC::GT:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
+ case AArch64CC::LE:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
+ case AArch64CC::LS:
+ return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
+ case AArch64CC::LO:
+ return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
+ case AArch64CC::LT:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
+ case AArch64CC::HI:
+ return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
+ case AArch64CC::HS:
+ return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
+ }
+}
+
+SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
+ SelectionDAG &DAG) const {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ if (LHS.getValueType().getVectorElementType().isInteger()) {
+ assert(LHS.getValueType() == RHS.getValueType());
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
+ dl, DAG);
+ }
+
+ assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
+ LHS.getValueType().getVectorElementType() == MVT::f64);
+
+ // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+ // clean. Some of them require two branches to implement.
+ AArch64CC::CondCode CC1, CC2;
+ bool ShouldInvert;
+ changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+ SDValue Cmp =
+ EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+ if (!Cmp.getNode())
+ return SDValue();
+
+ if (CC2 != AArch64CC::AL) {
+ SDValue Cmp2 =
+ EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+ if (!Cmp2.getNode())
+ return SDValue();
+
+ Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
+ }
+
+ if (ShouldInvert)
+ return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
+
+ return Cmp;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_ld4:
+ case Intrinsic::aarch64_neon_ld1x2:
+ case Intrinsic::aarch64_neon_ld1x3:
+ case Intrinsic::aarch64_neon_ld1x4:
+ case Intrinsic::aarch64_neon_ld2lane:
+ case Intrinsic::aarch64_neon_ld3lane:
+ case Intrinsic::aarch64_neon_ld4lane:
+ case Intrinsic::aarch64_neon_ld2r:
+ case Intrinsic::aarch64_neon_ld3r:
+ case Intrinsic::aarch64_neon_ld4r: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ Info.vol = false; // volatile loads with NEON intrinsics not supported
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::aarch64_neon_st2:
+ case Intrinsic::aarch64_neon_st3:
+ case Intrinsic::aarch64_neon_st4:
+ case Intrinsic::aarch64_neon_st1x2:
+ case Intrinsic::aarch64_neon_st1x3:
+ case Intrinsic::aarch64_neon_st1x4:
+ case Intrinsic::aarch64_neon_st2lane:
+ case Intrinsic::aarch64_neon_st3lane:
+ case Intrinsic::aarch64_neon_st4lane: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ Info.vol = false; // volatile stores with NEON intrinsics not supported
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::aarch64_ldaxr:
+ case Intrinsic::aarch64_ldxr: {
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.vol = true;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::aarch64_stlxr:
+ case Intrinsic::aarch64_stxr: {
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(1);
+ Info.offset = 0;
+ Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.vol = true;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::aarch64_ldaxp:
+ case Intrinsic::aarch64_ldxp: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 16;
+ Info.vol = true;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::aarch64_stlxp:
+ case Intrinsic::aarch64_stxp: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(2);
+ Info.offset = 0;
+ Info.align = 16;
+ Info.vol = true;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
+// Truncations from 64-bit GPR to 32-bit GPR is free.
+bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return true;
+}
+bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return true;
+}
+
+// All 32-bit GPR operations implicitly zero the high-half of the corresponding
+// 64-bit GPR.
+bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 == 32 && NumBits2 == 64)
+ return true;
+ return false;
+}
+bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 == 32 && NumBits2 == 64)
+ return true;
+ return false;
+}
+
+bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ EVT VT1 = Val.getValueType();
+ if (isZExtFree(VT1, VT2)) {
+ return true;
+ }
+
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
+ return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() &&
+ VT2.isInteger() && VT1.getSizeInBits() <= 32);
+}
+
+bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
+ unsigned &RequiredAligment) const {
+ if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
+ return false;
+ // Cyclone supports unaligned accesses.
+ RequiredAligment = 0;
+ unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
+ return NumBits == 32 || NumBits == 64;
+}
+
+bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
+ unsigned &RequiredAligment) const {
+ if (!LoadedType.isSimple() ||
+ (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
+ return false;
+ // Cyclone supports unaligned accesses.
+ RequiredAligment = 0;
+ unsigned NumBits = LoadedType.getSizeInBits();
+ return NumBits == 32 || NumBits == 64;
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+ unsigned AlignCheck) {
+ return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+ (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+ // instruction to materialize the v2i64 zero and one store (with restrictive
+ // addressing mode). Just do two i64 store of zero-registers.
+ bool Fast;
+ const Function *F = MF.getFunction();
+ if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+ !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat) &&
+ (memOpAlign(SrcAlign, DstAlign, 16) ||
+ (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+ return MVT::f128;
+
+ return Size >= 8 ? MVT::i64 : MVT::i32;
+}
+
+// 12-bit optionally shifted immediates are legal for adds.
+bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
+ if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
+ return true;
+ return false;
+}
+
+// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
+// immediates is the same as for an add or a sub.
+bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
+ if (Immed < 0)
+ Immed *= -1;
+ return isLegalAddImmediate(Immed);
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+ // AArch64 has five basic addressing modes:
+ // reg
+ // reg + 9-bit signed offset
+ // reg + SIZE_IN_BYTES * 12-bit unsigned offset
+ // reg1 + reg2
+ // reg + SIZE_IN_BYTES * reg
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // No reg+reg+imm addressing.
+ if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
+ return false;
+
+ // check reg + imm case:
+ // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
+ uint64_t NumBytes = 0;
+ if (Ty->isSized()) {
+ uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (!AM.Scale) {
+ int64_t Offset = AM.BaseOffs;
+
+ // 9-bit signed offset
+ if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
+ return true;
+
+ // 12-bit unsigned offset
+ unsigned shift = Log2_64(NumBytes);
+ if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
+ // Must be a multiple of NumBytes (NumBytes is a power of 2)
+ (Offset >> shift) << shift == Offset)
+ return true;
+ return false;
+ }
+
+ // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
+
+ if (!AM.Scale || AM.Scale == 1 ||
+ (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
+ return true;
+ return false;
+}
+
+int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
+ Type *Ty) const {
+ // Scaling factors are not free at all.
+ // Operands | Rt Latency
+ // -------------------------------------------
+ // Rt, [Xn, Xm] | 4
+ // -------------------------------------------
+ // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
+ // Rt, [Xn, Wm, <extend> #imm] |
+ if (isLegalAddressingMode(AM, Ty))
+ // Scale represents reg2 * scale, thus account for 1 if
+ // it is not equal to 0 or 1.
+ return AM.Scale != 0 && AM.Scale != 1;
+ return -1;
+}
+
+bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+const MCPhysReg *
+AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ // LR is a callee-save register, but we must treat it as clobbered by any call
+ // site. Hence we include LR in the scratch registers, which are in turn added
+ // as implicit-defs for stackmaps and patchpoints.
+ static const MCPhysReg ScratchRegs[] = {
+ AArch64::X16, AArch64::X17, AArch64::LR, 0
+ };
+ return ScratchRegs;
+}
+
+bool
+AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+ EVT VT = N->getValueType(0);
+ // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
+ // it with shift to let it be lowered to UBFX.
+ if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+ isa<ConstantSDNode>(N->getOperand(1))) {
+ uint64_t TruncMask = N->getConstantOperandVal(1);
+ if (isMask_64(TruncMask) &&
+ N->getOperand(0).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+ return false;
+ }
+ return true;
+}
+
+bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return false;
+
+ int64_t Val = Imm.getSExtValue();
+ if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
+ return true;
+
+ if ((int64_t)Val < 0)
+ Val = ~Val;
+ if (BitSize == 32)
+ Val &= (1LL << 32) - 1;
+
+ unsigned LZ = countLeadingZeros((uint64_t)Val);
+ unsigned Shift = (63 - LZ) / 16;
+ // MOVZ is free so return true for one or fewer MOVK.
+ return (Shift < 3) ? true : false;
+}
+
+// Generate SUBS and CSEL for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CSEL.
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+ N0.getOperand(0));
+ // Generate SUBS & CSEL.
+ SDValue Cmp =
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+ N0.getOperand(0), DAG.getConstant(0, VT));
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
+ DAG.getConstant(AArch64CC::PL, MVT::i32),
+ SDValue(Cmp.getNode(), 1));
+ }
+ return SDValue();
+}
+
+// performXorCombine - Attempts to handle integer ABS.
+static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ return performIntegerAbsCombine(N, DAG);
+}
+
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Multiplication of a power of two plus/minus one can be done more
+ // cheaply as as shift+add/sub. For now, this is true unilaterally. If
+ // future CPUs have a cheaper MADD instruction, this may need to be
+ // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+ // 64-bit is 5 cycles, so this is always a win.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ APInt Value = C->getAPIntValue();
+ EVT VT = N->getValueType(0);
+ APInt VP1 = Value + 1;
+ if (VP1.isPowerOf2()) {
+ // Multiplying by one less than a power of two, replace with a shift
+ // and a subtract.
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VP1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+ }
+ APInt VM1 = Value - 1;
+ if (VM1.isPowerOf2()) {
+ // Multiplying by one more than a power of two, replace with a shift
+ // and an add.
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VM1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+ }
+ }
+ return SDValue();
+}
+
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+ // Only optimize when the source and destination types have the same width.
+ if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
+ return SDValue();
+
+ // If the result of an integer load is only used by an integer-to-float
+ // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
+ // This eliminates an "integer-to-vector-move UOP and improve throughput.
+ SDValue N0 = N->getOperand(0);
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ // Do not change the width of a volatile load.
+ !cast<LoadSDNode>(N0)->isVolatile()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+ LN0->getPointerInfo(), LN0->isVolatile(),
+ LN0->isNonTemporal(), LN0->isInvariant(),
+ LN0->getAlignment());
+
+ // Make sure successors of the original load stay after it by updating them
+ // to use the new Chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
+
+ unsigned Opcode =
+ (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+ return DAG.getNode(Opcode, SDLoc(N), VT, Load);
+ }
+
+ return SDValue();
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
@@ -3782,44 +6461,19 @@
std::swap(ShiftLHS, ShiftRHS);
}
- return DAG.getNode(AArch64ISD::EXTR, DL, VT,
- LHS, RHS,
+ return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
DAG.getConstant(ShiftRHS, MVT::i64));
}
-/// Target-specific dag combine xforms for ISD::OR
-static SDValue PerformORCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
-
+static SDValue tryCombineToBSL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
- EVT VT = N->getValueType(0);
- if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ if (!VT.isVector())
return SDValue();
- // Attempt to recognise bitfield-insert operations.
- SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
- if (Res.getNode())
- return Res;
-
- // Attempt to combine an existing MaskedBFI operation into one with a larger
- // mask.
- Res = tryCombineToLargerBFI(N, DCI, Subtarget);
- if (Res.getNode())
- return Res;
-
- Res = tryCombineToEXTR(N, DCI);
- if (Res.getNode())
- return Res;
-
- if (!Subtarget->hasNEON())
- return SDValue();
-
- // Attempt to use vector immediate-form BSL
- // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
-
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND)
return SDValue();
@@ -3828,252 +6482,931 @@
if (N1.getOpcode() != ISD::AND)
return SDValue();
- if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
- APInt SplatUndef;
+ // We only have to look for constant vectors here since the general, variable
+ // case can be handled in TableGen.
+ unsigned Bits = VT.getVectorElementType().getSizeInBits();
+ uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
+ for (int i = 1; i >= 0; --i)
+ for (int j = 1; j >= 0; --j) {
+ BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+ BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+ if (!BVN0 || !BVN1)
+ continue;
+
+ bool FoundMatch = true;
+ for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+ ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+ ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+ if (!CN0 || !CN1 ||
+ CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+ FoundMatch = false;
+ break;
+ }
+ }
+
+ if (FoundMatch)
+ return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+ N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
+
+ return SDValue();
+}
+
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
+ if (!EnableAArch64ExtrGeneration)
+ return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue Res = tryCombineToEXTR(N, DCI);
+ if (Res.getNode())
+ return Res;
+
+ Res = tryCombineToBSL(N, DCI);
+ if (Res.getNode())
+ return Res;
+
+ return SDValue();
+}
+
+static SDValue performBitcastCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Wait 'til after everything is legalized to try this. That way we have
+ // legal vector types and such.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Remove extraneous bitcasts around an extract_subvector.
+ // For example,
+ // (v4i16 (bitconvert
+ // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
+ // becomes
+ // (extract_subvector ((v8i16 ...), (i64 4)))
+
+ // Only interested in 64-bit vectors as the ultimate result.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+ if (VT.getSimpleVT().getSizeInBits() != 64)
+ return SDValue();
+ // Is the operand an extract_subvector starting at the beginning or halfway
+ // point of the vector? A low half may also come through as an
+ // EXTRACT_SUBREG, so look for that, too.
+ SDValue Op0 = N->getOperand(0);
+ if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
+ !(Op0->isMachineOpcode() &&
+ Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
+ return SDValue();
+ uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
+ if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
+ return SDValue();
+ } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
+ if (idx != AArch64::dsub)
+ return SDValue();
+ // The dsub reference is equivalent to a lane zero subvector reference.
+ idx = 0;
+ }
+ // Look through the bitcast of the input to the extract.
+ if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
+ return SDValue();
+ SDValue Source = Op0->getOperand(0)->getOperand(0);
+ // If the source type has twice the number of elements as our destination
+ // type, we know this is an extract of the high or low half of the vector.
+ EVT SVT = Source->getValueType(0);
+ if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+ return SDValue();
+
+ DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+
+ // Create the simplified form to just extract the low or high half of the
+ // vector directly rather than bothering with the bitcasts.
+ SDLoc dl(N);
+ unsigned NumElements = VT.getVectorNumElements();
+ if (idx) {
+ SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
+ } else {
+ SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
+ return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
+ Source, SubReg),
+ 0);
+ }
+}
+
+static SDValue performConcatVectorsCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Wait 'til after everything is legalized to try this. That way we have
+ // legal vector types and such.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
+ // splat. The indexed instructions are going to be expecting a DUPLANE64, so
+ // canonicalise to that.
+ if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
+ assert(VT.getVectorElementType().getSizeInBits() == 64);
+ return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
+ WidenVector(N->getOperand(0), DAG),
+ DAG.getConstant(0, MVT::i64));
+ }
+
+ // Canonicalise concat_vectors so that the right-hand vector has as few
+ // bit-casts as possible before its real operation. The primary matching
+ // destination for these operations will be the narrowing "2" instructions,
+ // which depend on the operation being performed on this right-hand vector.
+ // For example,
+ // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
+ // becomes
+ // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
+
+ SDValue Op1 = N->getOperand(1);
+ if (Op1->getOpcode() != ISD::BITCAST)
+ return SDValue();
+ SDValue RHS = Op1->getOperand(0);
+ MVT RHSTy = RHS.getValueType().getSimpleVT();
+ // If the RHS is not a vector, this is not the pattern we're looking for.
+ if (!RHSTy.isVector())
+ return SDValue();
+
+ DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+
+ MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
+ RHSTy.getVectorNumElements() * 2);
+ return DAG.getNode(
+ ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+ DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
+}
+
+static SDValue tryCombineFixedPointConvert(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Wait 'til after everything is legalized to try this. That way we have
+ // legal vector types and such.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+ // Transform a scalar conversion of a value from a lane extract into a
+ // lane extract of a vector conversion. E.g., from foo1 to foo2:
+ // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
+ // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
+ //
+ // The second form interacts better with instruction selection and the
+ // register allocator to avoid cross-class register copies that aren't
+ // coalescable due to a lane reference.
+
+ // Check the operand and see if it originates from a lane extract.
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ // Yep, no additional predication needed. Perform the transform.
+ SDValue IID = N->getOperand(0);
+ SDValue Shift = N->getOperand(2);
+ SDValue Vec = Op1.getOperand(0);
+ SDValue Lane = Op1.getOperand(1);
+ EVT ResTy = N->getValueType(0);
+ EVT VecResTy;
+ SDLoc DL(N);
+
+ // The vector width should be 128 bits by the time we get here, even
+ // if it started as 64 bits (the extract_vector handling will have
+ // done so).
+ assert(Vec.getValueType().getSizeInBits() == 128 &&
+ "unexpected vector size on extract_vector_elt!");
+ if (Vec.getValueType() == MVT::v4i32)
+ VecResTy = MVT::v4f32;
+ else if (Vec.getValueType() == MVT::v2i64)
+ VecResTy = MVT::v2f64;
+ else
+ assert(0 && "unexpected vector type!");
+
+ SDValue Convert =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
+ }
+ return SDValue();
+}
+
+// AArch64 high-vector "long" operations are formed by performing the non-high
+// version on an extract_subvector of each operand which gets the high half:
+//
+// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
+//
+// However, there are cases which don't have an extract_high explicitly, but
+// have another operation that can be made compatible with one for free. For
+// example:
+//
+// (dupv64 scalar) --> (extract_high (dup128 scalar))
+//
+// This routine does the actual conversion of such DUPs, once outer routines
+// have determined that everything else is in order.
+static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+ // We can handle most types of duplicate, but the lane ones have an extra
+ // operand saying *which* lane, so we need to know.
+ bool IsDUPLANE;
+ switch (N.getOpcode()) {
+ case AArch64ISD::DUP:
+ IsDUPLANE = false;
+ break;
+ case AArch64ISD::DUPLANE8:
+ case AArch64ISD::DUPLANE16:
+ case AArch64ISD::DUPLANE32:
+ case AArch64ISD::DUPLANE64:
+ IsDUPLANE = true;
+ break;
+ default:
+ return SDValue();
+ }
+
+ MVT NarrowTy = N.getSimpleValueType();
+ if (!NarrowTy.is64BitVector())
+ return SDValue();
+
+ MVT ElementTy = NarrowTy.getVectorElementType();
+ unsigned NumElems = NarrowTy.getVectorNumElements();
+ MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+
+ SDValue NewDUP;
+ if (IsDUPLANE)
+ NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
+ N.getOperand(1));
+ else
+ NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
+ NewDUP, DAG.getConstant(NumElems, MVT::i64));
+}
+
+static bool isEssentiallyExtractSubvector(SDValue N) {
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ return true;
+
+ return N.getOpcode() == ISD::BITCAST &&
+ N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+}
+
+/// \brief Helper structure to keep track of ISD::SET_CC operands.
+struct GenericSetCCInfo {
+ const SDValue *Opnd0;
+ const SDValue *Opnd1;
+ ISD::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+struct AArch64SetCCInfo {
+ const SDValue *Cmp;
+ AArch64CC::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of SetCC information.
+union SetCCInfo {
+ GenericSetCCInfo Generic;
+ AArch64SetCCInfo AArch64;
+};
+
+/// \brief Helper structure to be able to read SetCC information. If set to
+/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
+/// GenericSetCCInfo.
+struct SetCCInfoAndKind {
+ SetCCInfo Info;
+ bool IsAArch64;
+};
+
+/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
+/// an
+/// AArch64 lowered one.
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meanginfull only when this function returns true.
+/// \return True when Op is a kind of SET_CC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+ // If this is a setcc, this is straight forward.
+ if (Op.getOpcode() == ISD::SETCC) {
+ SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
+ SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
+ SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SetCCInfo.IsAArch64 = false;
+ return true;
+ }
+ // Otherwise, check if this is a matching csel instruction.
+ // In other words:
+ // - csel 1, 0, cc
+ // - csel 0, 1, !cc
+ if (Op.getOpcode() != AArch64ISD::CSEL)
+ return false;
+ // Set the information about the operands.
+ // TODO: we want the operands of the Cmp not the csel
+ SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
+ SetCCInfo.IsAArch64 = true;
+ SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // Check that the operands matches the constraints:
+ // (1) Both operands must be constants.
+ // (2) One must be 1 and the other must be 0.
+ ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+ // Check (1).
+ if (!TValue || !FValue)
+ return false;
+
+ // Check (2).
+ if (!TValue->isOne()) {
+ // Update the comparison when we are interested in !cc.
+ std::swap(TValue, FValue);
+ SetCCInfo.Info.AArch64.CC =
+ AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
+ }
+ return TValue->isOne() && FValue->isNullValue();
+}
+
+// Returns true if Op is setcc or zext of setcc.
+static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+ if (isSetCC(Op, Info))
+ return true;
+ return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
+ isSetCC(Op->getOperand(0), Info));
+}
+
+// The folding we want to perform is:
+// (add x, [zext] (setcc cc ...) )
+// -->
+// (csel x, (add x, 1), !cc ...)
+//
+// The latter will get matched to a CSINC instruction.
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+ assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+ SDValue LHS = Op->getOperand(0);
+ SDValue RHS = Op->getOperand(1);
+ SetCCInfoAndKind InfoAndKind;
+
+ // If neither operand is a SET_CC, give up.
+ if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
+ std::swap(LHS, RHS);
+ if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
+ return SDValue();
+ }
+
+ // FIXME: This could be generatized to work for FP comparisons.
+ EVT CmpVT = InfoAndKind.IsAArch64
+ ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
+ : InfoAndKind.Info.Generic.Opnd0->getValueType();
+ if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+ return SDValue();
+
+ SDValue CCVal;
+ SDValue Cmp;
+ SDLoc dl(Op);
+ if (InfoAndKind.IsAArch64) {
+ CCVal = DAG.getConstant(
+ AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
+ Cmp = *InfoAndKind.Info.AArch64.Cmp;
+ } else
+ Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
+ *InfoAndKind.Info.Generic.Opnd1,
+ ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
+ CCVal, DAG, dl);
+
+ EVT VT = Op->getValueType(0);
+ LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
+ return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+// (add (zeroext (extract_high LHS)),
+// (zeroext (extract_high RHS)))
+// -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+static SDValue performAddSubLongCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ if (!VT.is128BitVector()) {
+ if (N->getOpcode() == ISD::ADD)
+ return performSetccAddFolding(N, DAG);
+ return SDValue();
+ }
+
+ // Make sure both branches are extended in the same way.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+ LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+ LHS.getOpcode() != RHS.getOpcode())
+ return SDValue();
+
+ unsigned ExtType = LHS.getOpcode();
+
+ // It's not worth doing if at least one of the inputs isn't already an
+ // extract, but we don't know which it'll be so we have to try both.
+ if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+ RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
+ if (!RHS.getNode())
+ return SDValue();
+
+ RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
+ } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+ LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
+ if (!LHS.getNode())
+ return SDValue();
+
+ LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
+ }
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
+}
+
+// Massage DAGs which we can use the high-half "long" operations on into
+// something isel will recognize better. E.g.
+//
+// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
+// (aarch64_neon_umull (extract_high (v2i64 vec)))
+// (extract_high (v2i64 (dup128 scalar)))))
+//
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ assert(LHS.getValueType().is64BitVector() &&
+ RHS.getValueType().is64BitVector() &&
+ "unexpected shape for long operation");
+
+ // Either node could be a DUP, but it's not worth doing both of them (you'd
+ // just as well use the non-high version) so look for a corresponding extract
+ // operation on the other "wing".
+ if (isEssentiallyExtractSubvector(LHS)) {
+ RHS = tryExtendDUPToExtractHigh(RHS, DAG);
+ if (!RHS.getNode())
+ return SDValue();
+ } else if (isEssentiallyExtractSubvector(RHS)) {
+ LHS = tryExtendDUPToExtractHigh(LHS, DAG);
+ if (!LHS.getNode())
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), LHS, RHS);
+}
+
+static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
+ MVT ElemTy = N->getSimpleValueType(0).getScalarType();
+ unsigned ElemBits = ElemTy.getSizeInBits();
+
+ int64_t ShiftAmount;
+ if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
+ APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
- APInt SplatBits0;
- if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
- HasAnyUndefs) &&
- !HasAnyUndefs) {
- BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
- APInt SplatBits1;
- if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
- HasAnyUndefs) && !HasAnyUndefs &&
- SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
- SplatBits0 == ~SplatBits1) {
+ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElemBits) ||
+ SplatBitSize != ElemBits)
+ return SDValue();
- return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1),
- N0->getOperand(0), N1->getOperand(0));
- }
- }
+ ShiftAmount = SplatValue.getSExtValue();
+ } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+ ShiftAmount = CVN->getSExtValue();
+ } else
+ return SDValue();
+
+ unsigned Opcode;
+ bool IsRightShift;
+ switch (IID) {
+ default:
+ llvm_unreachable("Unknown shift intrinsic");
+ case Intrinsic::aarch64_neon_sqshl:
+ Opcode = AArch64ISD::SQSHL_I;
+ IsRightShift = false;
+ break;
+ case Intrinsic::aarch64_neon_uqshl:
+ Opcode = AArch64ISD::UQSHL_I;
+ IsRightShift = false;
+ break;
+ case Intrinsic::aarch64_neon_srshl:
+ Opcode = AArch64ISD::SRSHR_I;
+ IsRightShift = true;
+ break;
+ case Intrinsic::aarch64_neon_urshl:
+ Opcode = AArch64ISD::URSHR_I;
+ IsRightShift = true;
+ break;
+ case Intrinsic::aarch64_neon_sqshlu:
+ Opcode = AArch64ISD::SQSHLU_I;
+ IsRightShift = false;
+ break;
}
+ if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+ DAG.getConstant(-ShiftAmount, MVT::i32));
+ else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+ DAG.getConstant(ShiftAmount, MVT::i32));
+
return SDValue();
}
-/// Target-specific dag combine xforms for ISD::SRA
-static SDValue PerformSRACombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+// The CRC32[BH] instructions ignore the high bits of their data operand. Since
+// the intrinsics must be legal and take an i32, this means there's almost
+// certainly going to be a zext in the DAG which we can eliminate.
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
+ SDValue AndN = N->getOperand(2);
+ if (AndN.getOpcode() != ISD::AND)
+ return SDValue();
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
+ if (!CMask || CMask->getZExtValue() != Mask)
+ return SDValue();
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
+ N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // We're looking for an SRA/SHL pair which form an SBFX.
-
- if (VT != MVT::i32 && VT != MVT::i64)
- return SDValue();
-
- if (!isa<ConstantSDNode>(N->getOperand(1)))
- return SDValue();
-
- uint64_t ExtraSignBits = N->getConstantOperandVal(1);
- SDValue Shift = N->getOperand(0);
-
- if (Shift.getOpcode() != ISD::SHL)
- return SDValue();
-
- if (!isa<ConstantSDNode>(Shift->getOperand(1)))
- return SDValue();
-
- uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
- uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
- uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
-
- if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
- return SDValue();
-
- return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
- DAG.getConstant(LSB, MVT::i64),
- DAG.getConstant(LSB + Width - 1, MVT::i64));
+ unsigned IID = getIntrinsicID(N);
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_vcvtfxs2fp:
+ case Intrinsic::aarch64_neon_vcvtfxu2fp:
+ return tryCombineFixedPointConvert(N, DCI, DAG);
+ break;
+ case Intrinsic::aarch64_neon_fmax:
+ return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fmin:
+ return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull:
+ case Intrinsic::aarch64_neon_pmull:
+ case Intrinsic::aarch64_neon_sqdmull:
+ return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+ case Intrinsic::aarch64_neon_sqshl:
+ case Intrinsic::aarch64_neon_uqshl:
+ case Intrinsic::aarch64_neon_sqshlu:
+ case Intrinsic::aarch64_neon_srshl:
+ case Intrinsic::aarch64_neon_urshl:
+ return tryCombineShiftImm(IID, N, DAG);
+ case Intrinsic::aarch64_crc32b:
+ case Intrinsic::aarch64_crc32cb:
+ return tryCombineCRC32(0xff, N, DAG);
+ case Intrinsic::aarch64_crc32h:
+ case Intrinsic::aarch64_crc32ch:
+ return tryCombineCRC32(0xffff, N, DAG);
+ }
+ return SDValue();
}
-/// Check if this is a valid build_vector for the immediate operand of
-/// a vector shift operation, where all the elements of the build_vector
-/// must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
- // Ignore bit_converts.
- while (Op.getOpcode() == ISD::BITCAST)
- Op = Op.getOperand(0);
- BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
- APInt SplatBits, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
- HasAnyUndefs, ElementBits) ||
- SplatBitSize > ElementBits)
- return false;
- Cnt = SplatBits.getSExtValue();
- return true;
-}
+static SDValue performExtendCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
+ // we can convert that DUP into another extract_high (of a bigger DUP), which
+ // helps the backend to decide that an sabdl2 would be useful, saving a real
+ // extract_high operation.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ SDNode *ABDNode = N->getOperand(0).getNode();
+ unsigned IID = getIntrinsicID(ABDNode);
+ if (IID == Intrinsic::aarch64_neon_sabd ||
+ IID == Intrinsic::aarch64_neon_uabd) {
+ SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+ if (!NewABD.getNode())
+ return SDValue();
-/// Check if this is a valid build_vector for the immediate operand of
-/// a vector shift left operation. That value must be in the range:
-/// 0 <= Value < ElementBits
-static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
- assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
- if (!getVShiftImm(Op, ElementBits, Cnt))
- return false;
- return (Cnt >= 0 && Cnt < ElementBits);
-}
-
-/// Check if this is a valid build_vector for the immediate operand of a
-/// vector shift right operation. The value must be in the range:
-/// 1 <= Value <= ElementBits
-static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
- assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
- if (!getVShiftImm(Op, ElementBits, Cnt))
- return false;
- return (Cnt >= 1 && Cnt <= ElementBits);
-}
-
-static SDValue GenForSextInreg(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- EVT SrcVT, EVT DestVT, EVT SubRegVT,
- const int *Mask, SDValue Src) {
- SelectionDAG &DAG = DCI.DAG;
- SDValue Bitcast
- = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src);
- SDValue Sext
- = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast);
- SDValue ShuffleVec
- = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask);
- SDValue ExtractSubreg
- = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N),
- SubRegVT, ShuffleVec,
- DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0);
- return ExtractSubreg;
-}
-
-/// Checks for vector shifts and lowers them.
-static SDValue PerformShiftCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *ST) {
- SelectionDAG &DAG = DCI.DAG;
- EVT VT = N->getValueType(0);
- if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
- return PerformSRACombine(N, DCI);
-
- // We're looking for an SRA/SHL pair to help generating instruction
- // sshll v0.8h, v0.8b, #0
- // The instruction STXL is also the alias of this instruction.
- //
- // For example, for DAG like below,
- // v2i32 = sra (v2i32 (shl v2i32, 16)), 16
- // we can transform it into
- // v2i32 = EXTRACT_SUBREG
- // (v4i32 (suffle_vector
- // (v4i32 (sext (v4i16 (bitcast v2i32))),
- // undef, (0, 2, u, u)),
- // sub_64
- //
- // With this transformation we expect to generate "SSHLL + UZIP1"
- // Sometimes UZIP1 can be optimized away by combining with other context.
- int64_t ShrCnt, ShlCnt;
- if (N->getOpcode() == ISD::SRA
- && (VT == MVT::v2i32 || VT == MVT::v4i16)
- && isVShiftRImm(N->getOperand(1), VT, ShrCnt)
- && N->getOperand(0).getOpcode() == ISD::SHL
- && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) {
- SDValue Src = N->getOperand(0).getOperand(0);
- if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) {
- // sext_inreg(v2i32, v2i16)
- // We essentially only care the Mask {0, 2, u, u}
- int Mask[4] = {0, 2, 4, 6};
- return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32,
- Mask, Src);
- }
- else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) {
- // sext_inreg(v2i16, v2i8)
- // We essentially only care the Mask {0, u, 4, u, u, u, u, u, u, u, u, u}
- int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
- return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32,
- Mask, Src);
- }
- else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) {
- // sext_inreg(v4i16, v4i8)
- // We essentially only care the Mask {0, 2, 4, 6, u, u, u, u, u, u, u, u}
- int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
- return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16,
- Mask, Src);
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+ NewABD);
}
}
- // Nothing to be done for scalar shifts.
+ // This is effectively a custom type legalization for AArch64.
+ //
+ // Type legalization will split an extend of a small, legal, type to a larger
+ // illegal type by first splitting the destination type, often creating
+ // illegal source types, which then get legalized in isel-confusing ways,
+ // leading to really terrible codegen. E.g.,
+ // %result = v8i32 sext v8i8 %value
+ // becomes
+ // %losrc = extract_subreg %value, ...
+ // %hisrc = extract_subreg %value, ...
+ // %lo = v4i32 sext v4i8 %losrc
+ // %hi = v4i32 sext v4i8 %hisrc
+ // Things go rapidly downhill from there.
+ //
+ // For AArch64, the [sz]ext vector instructions can only go up one element
+ // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
+ // take two instructions.
+ //
+ // This implies that the most efficient way to do the extend from v8i8
+ // to two v4i32 values is to first extend the v8i8 to v8i16, then do
+ // the normal splitting to happen for the v8i16->v8i32.
+
+ // This is pre-legalization to catch some cases where the default
+ // type legalization will create ill-tempered code.
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // We're only interested in cleaning things up for non-legal vector types
+ // here. If both the source and destination are legal, things will just
+ // work naturally without any fiddling.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!VT.isVector() || !TLI.isTypeLegal(VT))
+ EVT ResVT = N->getValueType(0);
+ if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
+ return SDValue();
+ // If the vector type isn't a simple VT, it's beyond the scope of what
+ // we're worried about here. Let legalization do its thing and hope for
+ // the best.
+ if (!ResVT.isSimple())
return SDValue();
- assert(ST->hasNEON() && "unexpected vector shift");
- int64_t Cnt;
+ SDValue Src = N->getOperand(0);
+ MVT SrcVT = Src->getValueType(0).getSimpleVT();
+ // If the source VT is a 64-bit vector, we can play games and get the
+ // better results we want.
+ if (SrcVT.getSizeInBits() != 64)
+ return SDValue();
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("unexpected shift opcode");
+ unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+ unsigned ElementCount = SrcVT.getVectorNumElements();
+ SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+ SDLoc DL(N);
+ Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
- case ISD::SHL:
- if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
- SDValue RHS =
- DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
- DAG.getConstant(Cnt, MVT::i32));
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
- }
- break;
+ // Now split the rest of the operation into two halves, each with a 64
+ // bit source.
+ EVT LoVT, HiVT;
+ SDValue Lo, Hi;
+ unsigned NumElements = ResVT.getVectorNumElements();
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+ LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+ ResVT.getVectorElementType(), NumElements / 2);
- case ISD::SRA:
- case ISD::SRL:
- if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
- SDValue RHS =
- DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
- DAG.getConstant(Cnt, MVT::i32));
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
- }
- break;
- }
+ EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+ LoVT.getVectorNumElements());
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+ DAG.getIntPtrConstant(0));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
- return SDValue();
+ // Now combine the parts back together so we still have a single result
+ // like the combiner expects.
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
-/// ARM-specific DAG combining for intrinsics.
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
+/// value. The load store optimizer pass will merge them to store pair stores.
+/// This has better performance than a splat of the scalar followed by a split
+/// vector store. Even if the stores are not merged it is four stores vs a dup,
+/// followed by an ext.b and two stores.
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
- switch (IntNo) {
- default:
- // Don't do anything for most intrinsics.
- break;
+ // Don't replace floating point stores, they possibly won't be transformed to
+ // stp because of the store pair suppress pass.
+ if (VT.isFloatingPoint())
+ return SDValue();
- case Intrinsic::arm_neon_vqshifts:
- case Intrinsic::arm_neon_vqshiftu:
- EVT VT = N->getOperand(1).getValueType();
- int64_t Cnt;
- if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
- break;
- unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
- ? AArch64ISD::NEON_QSHLs
- : AArch64ISD::NEON_QSHLu;
- return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
- N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+ // Check for insert vector elements.
+ if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+
+ // We can express a splat as store pair(s) for 2 or 4 elements.
+ unsigned NumVecElts = VT.getVectorNumElements();
+ if (NumVecElts != 4 && NumVecElts != 2)
+ return SDValue();
+ SDValue SplatVal = StVal.getOperand(1);
+ unsigned RemainInsertElts = NumVecElts - 1;
+
+ // Check that this is a splat.
+ while (--RemainInsertElts) {
+ SDValue NextInsertElt = StVal.getOperand(0);
+ if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+ if (NextInsertElt.getOperand(1) != SplatVal)
+ return SDValue();
+ StVal = NextInsertElt;
+ }
+ unsigned OrigAlignment = St->getAlignment();
+ unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
+ unsigned Alignment = std::min(OrigAlignment, EltOffset);
+
+ // Create scalar stores. This is at least as good as the code sequence for a
+ // split unaligned store wich is a dup.s, ext.b, and two stores.
+ // Most of the time the three stores should be replaced by store pair
+ // instructions (stp).
+ SDLoc DL(St);
+ SDValue BasePtr = St->getBasePtr();
+ SDValue NewST1 =
+ DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+
+ unsigned Offset = EltOffset;
+ while (--NumVecElts) {
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(Offset, MVT::i64));
+ NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), Alignment);
+ Offset += EltOffset;
+ }
+ return NewST1;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ StoreSDNode *S = cast<StoreSDNode>(N);
+ if (S->isVolatile())
+ return SDValue();
+
+ // Cyclone has bad performance on unaligned 16B stores when crossing line and
+ // page boundries. We want to split such stores.
+ if (!Subtarget->isCyclone())
+ return SDValue();
+
+ // Don't split at Oz.
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize);
+ if (IsMinSize)
+ return SDValue();
+
+ SDValue StVal = S->getValue();
+ EVT VT = StVal.getValueType();
+
+ // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+ // those up regresses performance on micro-benchmarks and olden/bh.
+ if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+ return SDValue();
+
+ // Split unaligned 16B stores. They are terrible for performance.
+ // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+ // extensions can use this to mark that it does not want splitting to happen
+ // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+ // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+ if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+ S->getAlignment() <= 2)
+ return SDValue();
+
+ // If we get a splat of a scalar convert this vector store to a store of
+ // scalars. They will be merged into store pairs thereby removing two
+ // instructions.
+ SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
+ if (ReplacedSplat != SDValue())
+ return ReplacedSplat;
+
+ SDLoc DL(S);
+ unsigned NumElts = VT.getVectorNumElements() / 2;
+ // Split VT into two.
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+ SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+ DAG.getIntPtrConstant(0));
+ SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+ DAG.getIntPtrConstant(NumElts));
+ SDValue BasePtr = S->getBasePtr();
+ SDValue NewST1 =
+ DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+ S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(8, MVT::i64));
+ return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
+ S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
+ S->getAlignment());
+}
+
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ bool IsLaneOp) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ unsigned LoadIdx = IsLaneOp ? 1 : 0;
+ SDNode *LD = N->getOperand(LoadIdx).getNode();
+ // If it is not LOAD, can not do such combine.
+ if (LD->getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
+ EVT MemVT = LoadSDN->getMemoryVT();
+ // Check if memory operand is the same type as the vector element.
+ if (MemVT != VT.getVectorElementType())
+ return SDValue();
+
+ // Check if there are other uses. If so, do not combine as it will introduce
+ // an extra load.
+ for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+ ++UI) {
+ if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+ continue;
+ if (*UI != N)
+ return SDValue();
}
+ SDValue Addr = LD->getOperand(1);
+ SDValue Vector = N->getOperand(0);
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
+ Addr.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD
+ || UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load. Otherwise, folding it
+ // would create a cycle.
+ if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+ continue;
+ // Also check that add is not used in the vector operand. This would also
+ // create a cycle.
+ if (User->isPredecessorOf(Vector.getNode()))
+ continue;
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+ uint32_t IncVal = CInc->getZExtValue();
+ unsigned NumBytes = VT.getScalarSizeInBits() / 8;
+ if (IncVal != NumBytes)
+ continue;
+ Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(LD->getOperand(0)); // Chain
+ if (IsLaneOp) {
+ Ops.push_back(Vector); // The vector to be inserted
+ Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+ }
+ Ops.push_back(Addr);
+ Ops.push_back(Inc);
+
+ EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+ SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+ unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+ MemVT,
+ LoadSDN->getMemOperand());
+
+ // Update the uses.
+ std::vector<SDValue> NewResults;
+ NewResults.push_back(SDValue(LD, 0)); // The result of load
+ NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+ DCI.CombineTo(LD, NewResults);
+ DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
+
+ break;
+ }
return SDValue();
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
-static SDValue CombineBaseUpdate(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- SelectionDAG &DAG = DCI.DAG;
- bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
- N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
- unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+ unsigned AddrOpIdx = N->getNumOperands() - 1;
SDValue Addr = N->getOperand(AddrOpIdx);
// Search for a use of the address operand that is an increment.
@@ -4090,106 +7423,96 @@
continue;
// Find the new opcode for the updating load/store.
- bool isLoad = true;
- bool isLaneOp = false;
+ bool IsStore = false;
+ bool IsLaneOp = false;
+ bool IsDupOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
- if (isIntrinsic) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: llvm_unreachable("unexpected intrinsic for Neon base update");
- case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD;
- NumVecs = 1; break;
- case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD;
- NumVecs = 2; break;
- case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD;
- NumVecs = 3; break;
- case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD;
- NumVecs = 4; break;
- case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD;
- NumVecs = 1; isLoad = false; break;
- case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD;
- NumVecs = 2; isLoad = false; break;
- case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD;
- NumVecs = 3; isLoad = false; break;
- case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD;
- NumVecs = 4; isLoad = false; break;
- case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
- NumVecs = 2; break;
- case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
- NumVecs = 3; break;
- case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
- NumVecs = 4; break;
- case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
- NumVecs = 2; isLoad = false; break;
- case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
- NumVecs = 3; isLoad = false; break;
- case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
- NumVecs = 4; isLoad = false; break;
- case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD;
- NumVecs = 2; isLaneOp = true; break;
- case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD;
- NumVecs = 3; isLaneOp = true; break;
- case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD;
- NumVecs = 4; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD;
- NumVecs = 2; isLoad = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD;
- NumVecs = 3; isLoad = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD;
- NumVecs = 4; isLoad = false; isLaneOp = true; break;
- }
- } else {
- isLaneOp = true;
- switch (N->getOpcode()) {
- default: llvm_unreachable("unexpected opcode for Neon base update");
- case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
- NumVecs = 2; break;
- case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
- NumVecs = 3; break;
- case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
- NumVecs = 4; break;
- }
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
+ NumVecs = 2; break;
+ case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
+ NumVecs = 3; break;
+ case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
+ NumVecs = 4; break;
+ case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
+ NumVecs = 2; IsStore = true; break;
+ case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
+ NumVecs = 3; IsStore = true; break;
+ case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
+ NumVecs = 4; IsStore = true; break;
+ case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
+ NumVecs = 2; break;
+ case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
+ NumVecs = 3; break;
+ case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
+ NumVecs = 4; break;
+ case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
+ NumVecs = 2; IsStore = true; break;
+ case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
+ NumVecs = 3; IsStore = true; break;
+ case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
+ NumVecs = 4; IsStore = true; break;
+ case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
+ NumVecs = 2; IsDupOp = true; break;
+ case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
+ NumVecs = 3; IsDupOp = true; break;
+ case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
+ NumVecs = 4; IsDupOp = true; break;
+ case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
+ NumVecs = 2; IsLaneOp = true; break;
+ case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
+ NumVecs = 3; IsLaneOp = true; break;
+ case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
+ NumVecs = 4; IsLaneOp = true; break;
+ case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
+ NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+ case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
+ NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+ case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
+ NumVecs = 4; IsStore = true; IsLaneOp = true; break;
}
- // Find the size of memory referenced by the load/store.
EVT VecTy;
- if (isLoad)
- VecTy = N->getValueType(0);
+ if (IsStore)
+ VecTy = N->getOperand(2).getValueType();
else
- VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
- unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
- if (isLaneOp)
- NumBytes /= VecTy.getVectorNumElements();
+ VecTy = N->getValueType(0);
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (IsLaneOp || IsDupOp)
+ NumBytes /= VecTy.getVectorNumElements();
if (IncVal != NumBytes)
continue;
- Inc = DAG.getTargetConstant(IncVal, MVT::i32);
+ Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // Incoming chain
+ // Load lane and store have vector list as input.
+ if (IsLaneOp || IsStore)
+ for (unsigned i = 2; i < AddrOpIdx; ++i)
+ Ops.push_back(N->getOperand(i));
+ Ops.push_back(Addr); // Base register
+ Ops.push_back(Inc);
- // Create the new updating load/store node.
+ // Return Types.
EVT Tys[6];
- unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+ unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
- Tys[n++] = MVT::i64;
- Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(N->getOperand(0)); // incoming chain
- Ops.push_back(N->getOperand(AddrOpIdx));
- Ops.push_back(Inc);
- for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
- Ops.push_back(N->getOperand(i));
- }
+ Tys[n++] = MVT::i64; // Type of write back register
+ Tys[n] = MVT::Other; // Type of the chain
+ SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
- Ops.data(), Ops.size(),
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
MemInt->getMemoryVT(),
MemInt->getMemOperand());
@@ -4198,7 +7521,7 @@
for (unsigned i = 0; i < NumResultVecs; ++i) {
NewResults.push_back(SDValue(UpdN.getNode(), i));
}
- NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
@@ -4207,107 +7530,58 @@
return SDValue();
}
-/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
-/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
-/// If so, combine them to a vldN-dup operation and return true.
-static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
- EVT VT = N->getValueType(0);
+// Optimize compare with zero and branch.
+static SDValue performBRCONDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest = N->getOperand(1);
+ SDValue CCVal = N->getOperand(2);
+ SDValue Cmp = N->getOperand(3);
- // Check if the VDUPLANE operand is a vldN-dup intrinsic.
- SDNode *VLD = N->getOperand(0).getNode();
- if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
- return SDValue();
- unsigned NumVecs = 0;
- unsigned NewOpc = 0;
- unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
- if (IntNo == Intrinsic::arm_neon_vld2lane) {
- NumVecs = 2;
- NewOpc = AArch64ISD::NEON_LD2DUP;
- } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
- NumVecs = 3;
- NewOpc = AArch64ISD::NEON_LD3DUP;
- } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
- NumVecs = 4;
- NewOpc = AArch64ISD::NEON_LD4DUP;
- } else {
- return SDValue();
- }
-
- // First check that all the vldN-lane uses are VDUPLANEs and that the lane
- // numbers match the load.
- unsigned VLDLaneNo =
- cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
- for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
- UI != UE; ++UI) {
- // Ignore uses of the chain result.
- if (UI.getUse().getResNo() == NumVecs)
- continue;
- SDNode *User = *UI;
- if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
- VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
- return SDValue();
- }
-
- // Create the vldN-dup node.
- EVT Tys[5];
- unsigned n;
- for (n = 0; n < NumVecs; ++n)
- Tys[n] = VT;
- Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
- SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
- MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
- SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
- VLDMemInt->getMemoryVT(),
- VLDMemInt->getMemOperand());
-
- // Update the uses.
- for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
- UI != UE; ++UI) {
- unsigned ResNo = UI.getUse().getResNo();
- // Ignore uses of the chain result.
- if (ResNo == NumVecs)
- continue;
- SDNode *User = *UI;
- DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
- }
-
- // Now the vldN-lane intrinsic is dead except for its chain result.
- // Update uses of the chain.
- std::vector<SDValue> VLDDupResults;
- for (unsigned n = 0; n < NumVecs; ++n)
- VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
- VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
- DCI.CombineTo(VLD, VLDDupResults);
-
- return SDValue(N, 0);
-}
-
-// v1i1 setcc ->
-// v1i1 (bitcast (i1 setcc (extract_vector_elt, extract_vector_elt))
-// FIXME: Currently the type legalizer can't handle SETCC having v1i1 as result.
-// If it can legalize "v1i1 SETCC" correctly, no need to combine such SETCC.
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
- EVT ResVT = N->getValueType(0);
-
- if (!ResVT.isVector() || ResVT.getVectorNumElements() != 1 ||
- ResVT.getVectorElementType() != MVT::i1)
+ assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
+ unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+ if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
return SDValue();
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- EVT CmpVT = LHS.getValueType();
- LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
- CmpVT.getVectorElementType(), LHS,
- DAG.getConstant(0, MVT::i64));
- RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
- CmpVT.getVectorElementType(), RHS,
- DAG.getConstant(0, MVT::i64));
- SDValue SetCC =
- DAG.getSetCC(SDLoc(N), MVT::i1, LHS, RHS,
- cast<CondCodeSDNode>(N->getOperand(2))->get());
- return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, SetCC);
+ unsigned CmpOpc = Cmp.getOpcode();
+ if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
+ return SDValue();
+
+ // Only attempt folding if there is only one use of the flag and no use of the
+ // value.
+ if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
+ return SDValue();
+
+ SDValue LHS = Cmp.getOperand(0);
+ SDValue RHS = Cmp.getOperand(1);
+
+ assert(LHS.getValueType() == RHS.getValueType() &&
+ "Expected the value type to be the same for both operands!");
+ if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+ return SDValue();
+
+ if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+ std::swap(LHS, RHS);
+
+ if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+ return SDValue();
+
+ if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
+ LHS.getOpcode() == ISD::SRL)
+ return SDValue();
+
+ // Fold the compare into the branch instruction.
+ SDValue BR;
+ if (CC == AArch64CC::EQ)
+ BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+ else
+ BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, BR, false);
+
+ return SDValue();
}
// vselect (v1i1 setcc) ->
@@ -4315,7 +7589,7 @@
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
-static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
@@ -4340,79 +7614,109 @@
IfTrue, IfFalse);
}
-// sign_extend (extract_vector_elt (v1i1 setcc)) ->
-// extract_vector_elt (v1iXX setcc)
-// (XX is the size of the compared operand type)
-static SDValue PerformSignExtendCombine(SDNode *N, SelectionDAG &DAG) {
+/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
+/// the compare-mask instructions rather than going via NZCV, even if LHS and
+/// RHS are really scalar. This replaces any scalar setcc in the above pattern
+/// with a vector one followed by a DUP shuffle on the result.
+static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
- SDValue Vec = N0.getOperand(0);
-
- if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- Vec.getOpcode() != ISD::SETCC)
- return SDValue();
-
EVT ResVT = N->getValueType(0);
- EVT CmpVT = Vec.getOperand(0).getValueType();
- // Only optimize when the result type is of the same size as the element
- // type of the compared operand.
- if (ResVT.getSizeInBits() != CmpVT.getVectorElementType().getSizeInBits())
+
+ if (!N->getOperand(1).getValueType().isVector())
return SDValue();
- SDValue Lane = N0.getOperand(1);
- SDValue SetCC =
- DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
- Vec.getOperand(0), Vec.getOperand(1),
- cast<CondCodeSDNode>(Vec.getOperand(2))->get());
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ResVT,
- SetCC, Lane);
+ if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDLoc DL(N0);
+
+ EVT SrcVT = N0.getOperand(0).getValueType();
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
+ ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+ EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
+
+ // First perform a vector comparison, where lane 0 is the one we're interested
+ // in.
+ SDValue LHS =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+ SDValue RHS =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+ SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
+
+ // Now duplicate the comparison mask we want across all other lanes.
+ SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
+ SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+ Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
+ Mask);
+
+ return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
-SDValue
-AArch64TargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
- default: break;
- case ISD::AND: return PerformANDCombine(N, DCI);
- case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL:
- return PerformShiftCombine(N, DCI, getSubtarget());
- case ISD::SETCC: return PerformSETCCCombine(N, DCI.DAG);
- case ISD::VSELECT: return PerformVSelectCombine(N, DCI.DAG);
- case ISD::SIGN_EXTEND: return PerformSignExtendCombine(N, DCI.DAG);
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ return performAddSubLongCombine(N, DCI, DAG);
+ case ISD::XOR:
+ return performXorCombine(N, DAG, DCI, Subtarget);
+ case ISD::MUL:
+ return performMulCombine(N, DAG, DCI, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return performIntToFpCombine(N, DAG);
+ case ISD::OR:
+ return performORCombine(N, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
- return PerformIntrinsicCombine(N, DCI.DAG);
- case AArch64ISD::NEON_VDUPLANE:
- return CombineVLDDUP(N, DCI);
- case AArch64ISD::NEON_LD2DUP:
- case AArch64ISD::NEON_LD3DUP:
- case AArch64ISD::NEON_LD4DUP:
- return CombineBaseUpdate(N, DCI);
+ return performIntrinsicCombine(N, DCI, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ return performExtendCombine(N, DCI, DAG);
+ case ISD::BITCAST:
+ return performBitcastCombine(N, DCI, DAG);
+ case ISD::CONCAT_VECTORS:
+ return performConcatVectorsCombine(N, DCI, DAG);
+ case ISD::SELECT:
+ return performSelectCombine(N, DAG);
+ case ISD::VSELECT:
+ return performVSelectCombine(N, DCI.DAG);
+ case ISD::STORE:
+ return performSTORECombine(N, DCI, DAG, Subtarget);
+ case AArch64ISD::BRCOND:
+ return performBRCONDCombine(N, DCI, DAG);
+ case AArch64ISD::DUP:
+ return performPostLD1Combine(N, DCI, false);
+ case ISD::INSERT_VECTOR_ELT:
+ return performPostLD1Combine(N, DCI, true);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
- case Intrinsic::arm_neon_vld1:
- case Intrinsic::arm_neon_vld2:
- case Intrinsic::arm_neon_vld3:
- case Intrinsic::arm_neon_vld4:
- case Intrinsic::arm_neon_vst1:
- case Intrinsic::arm_neon_vst2:
- case Intrinsic::arm_neon_vst3:
- case Intrinsic::arm_neon_vst4:
- case Intrinsic::arm_neon_vld2lane:
- case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane:
- case Intrinsic::aarch64_neon_vld1x2:
- case Intrinsic::aarch64_neon_vld1x3:
- case Intrinsic::aarch64_neon_vld1x4:
- case Intrinsic::aarch64_neon_vst1x2:
- case Intrinsic::aarch64_neon_vst1x3:
- case Intrinsic::aarch64_neon_vst1x4:
- case Intrinsic::arm_neon_vst2lane:
- case Intrinsic::arm_neon_vst3lane:
- case Intrinsic::arm_neon_vst4lane:
- return CombineBaseUpdate(N, DCI);
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_ld4:
+ case Intrinsic::aarch64_neon_ld1x2:
+ case Intrinsic::aarch64_neon_ld1x3:
+ case Intrinsic::aarch64_neon_ld1x4:
+ case Intrinsic::aarch64_neon_ld2lane:
+ case Intrinsic::aarch64_neon_ld3lane:
+ case Intrinsic::aarch64_neon_ld4lane:
+ case Intrinsic::aarch64_neon_ld2r:
+ case Intrinsic::aarch64_neon_ld3r:
+ case Intrinsic::aarch64_neon_ld4r:
+ case Intrinsic::aarch64_neon_st2:
+ case Intrinsic::aarch64_neon_st3:
+ case Intrinsic::aarch64_neon_st4:
+ case Intrinsic::aarch64_neon_st1x2:
+ case Intrinsic::aarch64_neon_st1x3:
+ case Intrinsic::aarch64_neon_st1x4:
+ case Intrinsic::aarch64_neon_st2lane:
+ case Intrinsic::aarch64_neon_st3lane:
+ case Intrinsic::aarch64_neon_st4lane:
+ return performNEONPostLDSTCombine(N, DCI, DAG);
default:
break;
}
@@ -4420,979 +7724,214 @@
return SDValue();
}
-bool
-AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
- VT = VT.getScalarType();
-
- if (!VT.isSimple())
+// Check if the return value is used as only a return value, as otherwise
+// we can't perform a tail-call. In particular, we need to check for
+// target ISD nodes that are returns and any other "odd" constructs
+// that the generic analysis code won't necessarily catch.
+bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
+ SDValue &Chain) const {
+ if (N->getNumValues() != 1)
+ return false;
+ if (!N->hasNUsesOfValue(1, 0))
return false;
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f16:
- case MVT::f32:
- case MVT::f64:
- return true;
- case MVT::f128:
- return false;
- default:
- break;
- }
-
- return false;
-}
-// Check whether a shuffle_vector could be presented as concat_vector.
-bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG,
- SDValue V0, SDValue V1,
- const int *Mask,
- SDValue &Res) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- if (VT.getSizeInBits() != 128)
- return false;
- if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
- VT.getVectorElementType() != V1.getValueType().getVectorElementType())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- bool isContactVector = true;
- bool splitV0 = false;
- if (V0.getValueType().getSizeInBits() == 128)
- splitV0 = true;
-
- for (int I = 0, E = NumElts / 2; I != E; I++) {
- if (Mask[I] != I) {
- isContactVector = false;
- break;
- }
- }
-
- if (isContactVector) {
- int offset = NumElts / 2;
- for (int I = NumElts / 2, E = NumElts; I != E; I++) {
- if (Mask[I] != I + splitV0 * offset) {
- isContactVector = false;
- break;
- }
- }
- }
-
- if (isContactVector) {
- EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- NumElts / 2);
- if (splitV0) {
- V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
- DAG.getConstant(0, MVT::i64));
- }
- if (V1.getValueType().getSizeInBits() == 128) {
- V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
- DAG.getConstant(0, MVT::i64));
- }
- Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
- return true;
- }
- return false;
-}
-
-// Check whether a Build Vector could be presented as Shuffle Vector.
-// This Shuffle Vector maybe not legalized, so the length of its operand and
-// the length of result may not equal.
-bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
- SDValue &V0, SDValue &V1,
- int *Mask) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- unsigned NumElts = VT.getVectorNumElements();
- unsigned V0NumElts = 0;
-
- // Check if all elements are extracted from less than 3 vectors.
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Elt = Op.getOperand(i);
- if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- Elt.getOperand(0).getValueType().getVectorElementType() !=
- VT.getVectorElementType())
+ SDValue TCChain = Chain;
+ SDNode *Copy = *N->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
+ MVT::Glue)
return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+ return false;
- if (V0.getNode() == 0) {
- V0 = Elt.getOperand(0);
- V0NumElts = V0.getValueType().getVectorNumElements();
- }
- if (Elt.getOperand(0) == V0) {
- Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
- continue;
- } else if (V1.getNode() == 0) {
- V1 = Elt.getOperand(0);
- }
- if (Elt.getOperand(0) == V1) {
- unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
- Mask[i] = (Lane + V0NumElts);
- continue;
- } else {
+ bool HasRet = false;
+ for (SDNode *Node : Copy->uses()) {
+ if (Node->getOpcode() != AArch64ISD::RET_FLAG)
return false;
- }
+ HasRet = true;
}
+
+ if (!HasRet)
+ return false;
+
+ Chain = TCChain;
return true;
}
-// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a quad-shift!");
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
- assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
- DAG.getConstant(VTBits, MVT::i64), ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
- DAG.getConstant(VTBits, MVT::i64));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
- SDValue Tmp3 = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-
- SDValue A64cc;
- SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt,
- DAG.getConstant(0, MVT::i64),
- ISD::SETGE, A64cc,
- DAG, dl);
-
- SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- DAG.getConstant(0, Tmp3.getValueType()), Tmp3,
- A64cc);
- SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- TrueVal, FalseVal, A64cc);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, 2, dl);
-}
-
-/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a quad-shift!");
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
-
- assert(Op.getOpcode() == ISD::SHL_PARTS);
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
- DAG.getConstant(VTBits, MVT::i64), ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
- DAG.getConstant(VTBits, MVT::i64));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue Tmp4 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-
- SDValue A64cc;
- SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt,
- DAG.getConstant(0, MVT::i64),
- ISD::SETGE, A64cc,
- DAG, dl);
-
- SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- DAG.getConstant(0, Tmp4.getValueType()), Tmp4,
- A64cc);
- SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- Tmp3, FalseVal, A64cc);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, 2, dl);
-}
-
-// If this is a case we can't handle, return null and let the default
-// expansion code take care of it.
-SDValue
-AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
- const AArch64Subtarget *ST) const {
-
- BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- APInt SplatBits, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
-
- unsigned UseNeonMov = VT.getSizeInBits() >= 64;
-
- // Note we favor lowering MOVI over MVNI.
- // This has implications on the definition of patterns in TableGen to select
- // BIC immediate instructions but not ORR immediate instructions.
- // If this lowering order is changed, TableGen patterns for BIC immediate and
- // ORR immediate instructions have to be updated.
- if (UseNeonMov &&
- BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- if (SplatBitSize <= 64) {
- // First attempt to use vector immediate-form MOVI
- EVT NeonMovVT;
- unsigned Imm = 0;
- unsigned OpCmode = 0;
-
- if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
- SplatBitSize, DAG, VT.is128BitVector(),
- Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
- SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
- SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
-
- if (ImmVal.getNode() && OpCmodeVal.getNode()) {
- SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
- ImmVal, OpCmodeVal);
- return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
- }
- }
-
- // Then attempt to use vector immediate-form MVNI
- uint64_t NegatedImm = (~SplatBits).getZExtValue();
- if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
- DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
- Imm, OpCmode)) {
- SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
- SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
- if (ImmVal.getNode() && OpCmodeVal.getNode()) {
- SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
- ImmVal, OpCmodeVal);
- return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
- }
- }
-
- // Attempt to use vector immediate-form FMOV
- if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
- (VT == MVT::v2f64 && SplatBitSize == 64)) {
- APFloat RealVal(
- SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
- SplatBits);
- uint32_t ImmVal;
- if (A64Imms::isFPImm(RealVal, ImmVal)) {
- SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
- return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
- }
- }
- }
- }
-
- unsigned NumElts = VT.getVectorNumElements();
- bool isOnlyLowElement = true;
- bool usesOnlyOneValue = true;
- bool hasDominantValue = false;
- bool isConstant = true;
-
- // Map of the number of times a particular SDValue appears in the
- // element list.
- DenseMap<SDValue, unsigned> ValueCounts;
- SDValue Value;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
- continue;
- if (i > 0)
- isOnlyLowElement = false;
- if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
- isConstant = false;
-
- ValueCounts.insert(std::make_pair(V, 0));
- unsigned &Count = ValueCounts[V];
-
- // Is this value dominant? (takes up more than half of the lanes)
- if (++Count > (NumElts / 2)) {
- hasDominantValue = true;
- Value = V;
- }
- }
- if (ValueCounts.size() != 1)
- usesOnlyOneValue = false;
- if (!Value.getNode() && ValueCounts.size() > 0)
- Value = ValueCounts.begin()->first;
-
- if (ValueCounts.size() == 0)
- return DAG.getUNDEF(VT);
-
- if (isOnlyLowElement)
- return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
-
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (hasDominantValue && EltSize <= 64) {
- // Use VDUP for non-constant splats.
- if (!isConstant) {
- SDValue N;
-
- // If we are DUPing a value that comes directly from a vector, we could
- // just use DUPLANE. We can only do this if the lane being extracted
- // is at a constant index, as the DUP from lane instructions only have
- // constant-index forms.
- //
- // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can
- // remove TRUNCATE for DUPLANE by apdating the source vector to
- // appropriate vector type and lane index.
- //
- // FIXME: for now we have v1i8, v1i16, v1i32 legal vector types, if they
- // are not legal any more, no need to check the type size in bits should
- // be large than 64.
- SDValue V = Value;
- if (Value->getOpcode() == ISD::TRUNCATE)
- V = Value->getOperand(0);
- if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isa<ConstantSDNode>(V->getOperand(1)) &&
- V->getOperand(0).getValueType().getSizeInBits() >= 64) {
-
- // If the element size of source vector is larger than DUPLANE
- // element size, we can do transformation by,
- // 1) bitcasting source register to smaller element vector
- // 2) mutiplying the lane index by SrcEltSize/ResEltSize
- // For example, we can lower
- // "v8i16 vdup_lane(v4i32, 1)"
- // to be
- // "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)".
- SDValue SrcVec = V->getOperand(0);
- unsigned SrcEltSize =
- SrcVec.getValueType().getVectorElementType().getSizeInBits();
- unsigned ResEltSize = VT.getVectorElementType().getSizeInBits();
- if (SrcEltSize > ResEltSize) {
- assert((SrcEltSize % ResEltSize == 0) && "Invalid element size");
- SDValue BitCast;
- unsigned SrcSize = SrcVec.getValueType().getSizeInBits();
- unsigned ResSize = VT.getSizeInBits();
-
- if (SrcSize > ResSize) {
- assert((SrcSize % ResSize == 0) && "Invalid vector size");
- EVT CastVT =
- EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- SrcSize / ResEltSize);
- BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec);
- } else {
- assert((SrcSize == ResSize) && "Invalid vector size of source vec");
- BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec);
- }
-
- unsigned LaneIdx = V->getConstantOperandVal(1);
- SDValue Lane =
- DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64);
- N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane);
- } else {
- assert((SrcEltSize == ResEltSize) &&
- "Invalid element size of source vec");
- N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0),
- V->getOperand(1));
- }
- } else
- N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
-
- if (!usesOnlyOneValue) {
- // The dominant value was splatted as 'N', but we now have to insert
- // all differing elements.
- for (unsigned I = 0; I < NumElts; ++I) {
- if (Op.getOperand(I) == Value)
- continue;
- SmallVector<SDValue, 3> Ops;
- Ops.push_back(N);
- Ops.push_back(Op.getOperand(I));
- Ops.push_back(DAG.getConstant(I, MVT::i64));
- N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
- }
- }
- return N;
- }
- if (usesOnlyOneValue && isConstant) {
- return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
- }
- }
- // If all elements are constants and the case above didn't get hit, fall back
- // to the default expansion, which will generate a load from the constant
- // pool.
- if (isConstant)
- return SDValue();
-
- // Try to lower this in lowering ShuffleVector way.
- SDValue V0, V1;
- int Mask[16];
- if (isKnownShuffleVector(Op, DAG, V0, V1, Mask)) {
- unsigned V0NumElts = V0.getValueType().getVectorNumElements();
- if (!V1.getNode() && V0NumElts == NumElts * 2) {
- V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
- DAG.getConstant(NumElts, MVT::i64));
- V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
- DAG.getConstant(0, MVT::i64));
- V0NumElts = V0.getValueType().getVectorNumElements();
- }
-
- if (V1.getNode() && NumElts == V0NumElts &&
- V0NumElts == V1.getValueType().getVectorNumElements()) {
- SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
- return Shuffle;
- else
- return LowerVECTOR_SHUFFLE(Shuffle, DAG);
- } else {
- SDValue Res;
- if (isConcatVector(Op, DAG, V0, V1, Mask, Res))
- return Res;
- }
- }
-
- // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
- // know the default expansion would otherwise fall back on something even
- // worse. For a vector with one or two non-undef values, that's
- // scalar_to_vector for the elements followed by a shuffle (provided the
- // shuffle is valid for the target) and materialization element by element
- // on the stack followed by a load for everything else.
- if (!isConstant && !usesOnlyOneValue) {
- SDValue Vec = DAG.getUNDEF(VT);
- for (unsigned i = 0 ; i < NumElts; ++i) {
- SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
- continue;
- SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
- Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
- }
- return Vec;
- }
- return SDValue();
-}
-
-/// isREVMask - Check if a vector shuffle corresponds to a REV
-/// instruction with the specified blocksize. (The order of the elements
-/// within each block of the vector is reversed.)
-static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
- assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
- "Only possible block sizes for REV are: 16, 32, 64");
-
- unsigned EltSz = VT.getVectorElementType().getSizeInBits();
- if (EltSz == 64)
+// Return whether the an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction.
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ if (!CI->isTailCall())
return false;
- unsigned NumElts = VT.getVectorNumElements();
- unsigned BlockElts = M[0] + 1;
- // If the first shuffle index is UNDEF, be optimistic.
- if (M[0] < 0)
- BlockElts = BlockSize / EltSz;
-
- if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
- return false;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- if (M[i] < 0)
- continue; // ignore UNDEF indices
- if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
- return false;
- }
-
return true;
}
-// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
-// TRN instruction.
-static unsigned isPermuteMask(ArrayRef<int> M, EVT VT, bool isV2undef) {
- unsigned NumElts = VT.getVectorNumElements();
- if (NumElts < 4)
- return 0;
+bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ bool &IsInc,
+ SelectionDAG &DAG) const {
+ if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
+ return false;
- bool ismatch = true;
-
- // Check UZP1
- for (unsigned i = 0; i < NumElts; ++i) {
- unsigned answer = i * 2;
- if (isV2undef && answer >= NumElts)
- answer -= NumElts;
- if (M[i] != -1 && (unsigned)M[i] != answer) {
- ismatch = false;
- break;
- }
- }
- if (ismatch)
- return AArch64ISD::NEON_UZP1;
-
- // Check UZP2
- ismatch = true;
- for (unsigned i = 0; i < NumElts; ++i) {
- unsigned answer = i * 2 + 1;
- if (isV2undef && answer >= NumElts)
- answer -= NumElts;
- if (M[i] != -1 && (unsigned)M[i] != answer) {
- ismatch = false;
- break;
- }
- }
- if (ismatch)
- return AArch64ISD::NEON_UZP2;
-
- // Check ZIP1
- ismatch = true;
- for (unsigned i = 0; i < NumElts; ++i) {
- unsigned answer = i / 2 + NumElts * (i % 2);
- if (isV2undef && answer >= NumElts)
- answer -= NumElts;
- if (M[i] != -1 && (unsigned)M[i] != answer) {
- ismatch = false;
- break;
- }
- }
- if (ismatch)
- return AArch64ISD::NEON_ZIP1;
-
- // Check ZIP2
- ismatch = true;
- for (unsigned i = 0; i < NumElts; ++i) {
- unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2);
- if (isV2undef && answer >= NumElts)
- answer -= NumElts;
- if (M[i] != -1 && (unsigned)M[i] != answer) {
- ismatch = false;
- break;
- }
- }
- if (ismatch)
- return AArch64ISD::NEON_ZIP2;
-
- // Check TRN1
- ismatch = true;
- for (unsigned i = 0; i < NumElts; ++i) {
- unsigned answer = i + (NumElts - 1) * (i % 2);
- if (isV2undef && answer >= NumElts)
- answer -= NumElts;
- if (M[i] != -1 && (unsigned)M[i] != answer) {
- ismatch = false;
- break;
- }
- }
- if (ismatch)
- return AArch64ISD::NEON_TRN1;
-
- // Check TRN2
- ismatch = true;
- for (unsigned i = 0; i < NumElts; ++i) {
- unsigned answer = 1 + i + (NumElts - 1) * (i % 2);
- if (isV2undef && answer >= NumElts)
- answer -= NumElts;
- if (M[i] != -1 && (unsigned)M[i] != answer) {
- ismatch = false;
- break;
- }
- }
- if (ismatch)
- return AArch64ISD::NEON_TRN2;
-
- return 0;
-}
-
-SDValue
-AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
- ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
-
- // Convert shuffles that are directly supported on NEON to target-specific
- // DAG nodes, instead of keeping them as shuffles and matching them again
- // during code selection. This is more efficient and avoids the possibility
- // of inconsistencies between legalization and selection.
- ArrayRef<int> ShuffleMask = SVN->getMask();
-
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (EltSize > 64)
- return SDValue();
-
- if (isREVMask(ShuffleMask, VT, 64))
- return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
- if (isREVMask(ShuffleMask, VT, 32))
- return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
- if (isREVMask(ShuffleMask, VT, 16))
- return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
-
- unsigned ISDNo;
- if (V2.getOpcode() == ISD::UNDEF)
- ISDNo = isPermuteMask(ShuffleMask, VT, true);
- else
- ISDNo = isPermuteMask(ShuffleMask, VT, false);
-
- if (ISDNo) {
- if (V2.getOpcode() == ISD::UNDEF)
- return DAG.getNode(ISDNo, dl, VT, V1, V1);
- else
- return DAG.getNode(ISDNo, dl, VT, V1, V2);
- }
-
- SDValue Res;
- if (isConcatVector(Op, DAG, V1, V2, &ShuffleMask[0], Res))
- return Res;
-
- // If the element of shuffle mask are all the same constant, we can
- // transform it into either NEON_VDUP or NEON_VDUPLANE
- if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
- int Lane = SVN->getSplatIndex();
- // If this is undef splat, generate it via "just" vdup, if possible.
- if (Lane == -1) Lane = 0;
-
- // Test if V1 is a SCALAR_TO_VECTOR.
- if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
- }
- // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
- if (V1.getOpcode() == ISD::BUILD_VECTOR) {
- bool IsScalarToVector = true;
- for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
- if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
- i != (unsigned)Lane) {
- IsScalarToVector = false;
- break;
- }
- if (IsScalarToVector)
- return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
- V1.getOperand(Lane));
- }
-
- // Test if V1 is a EXTRACT_SUBVECTOR.
- if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
- return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
- DAG.getConstant(Lane + ExtLane, MVT::i64));
- }
- // Test if V1 is a CONCAT_VECTORS.
- if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
- V1.getOperand(1).getOpcode() == ISD::UNDEF) {
- SDValue Op0 = V1.getOperand(0);
- assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
- "Invalid vector lane access");
- return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
- DAG.getConstant(Lane, MVT::i64));
- }
-
- return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
- DAG.getConstant(Lane, MVT::i64));
- }
-
- int Length = ShuffleMask.size();
- int V1EltNum = V1.getValueType().getVectorNumElements();
-
- // If the number of v1 elements is the same as the number of shuffle mask
- // element and the shuffle masks are sequential values, we can transform
- // it into NEON_VEXTRACT.
- if (V1EltNum == Length) {
- // Check if the shuffle mask is sequential.
- int SkipUndef = 0;
- while (ShuffleMask[SkipUndef] == -1) {
- SkipUndef++;
- }
- int CurMask = ShuffleMask[SkipUndef];
- if (CurMask >= SkipUndef) {
- bool IsSequential = true;
- for (int I = SkipUndef; I < Length; ++I) {
- if (ShuffleMask[I] != -1 && ShuffleMask[I] != CurMask) {
- IsSequential = false;
- break;
- }
- CurMask++;
- }
- if (IsSequential) {
- assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
- unsigned VecSize = EltSize * V1EltNum;
- unsigned Index = (EltSize / 8) * (ShuffleMask[SkipUndef] - SkipUndef);
- if (VecSize == 64 || VecSize == 128)
- return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
- DAG.getConstant(Index, MVT::i64));
- }
- }
- }
-
- // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
- // by element from V2 to V1 .
- // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
- // better choice to be inserted than V1 as less insert needed, so we count
- // element to be inserted for both V1 and V2, and select less one as insert
- // target.
-
- // Collect elements need to be inserted and their index.
- SmallVector<int, 8> NV1Elt;
- SmallVector<int, 8> N1Index;
- SmallVector<int, 8> NV2Elt;
- SmallVector<int, 8> N2Index;
- for (int I = 0; I != Length; ++I) {
- if (ShuffleMask[I] != I) {
- NV1Elt.push_back(ShuffleMask[I]);
- N1Index.push_back(I);
- }
- }
- for (int I = 0; I != Length; ++I) {
- if (ShuffleMask[I] != (I + V1EltNum)) {
- NV2Elt.push_back(ShuffleMask[I]);
- N2Index.push_back(I);
- }
- }
-
- // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
- // will be inserted.
- SDValue InsV = V1;
- SmallVector<int, 8> InsMasks = NV1Elt;
- SmallVector<int, 8> InsIndex = N1Index;
- if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
- if (NV1Elt.size() > NV2Elt.size()) {
- InsV = V2;
- InsMasks = NV2Elt;
- InsIndex = N2Index;
- }
- } else {
- InsV = DAG.getNode(ISD::UNDEF, dl, VT);
- }
-
- for (int I = 0, E = InsMasks.size(); I != E; ++I) {
- SDValue ExtV = V1;
- int Mask = InsMasks[I];
- if (Mask >= V1EltNum) {
- ExtV = V2;
- Mask -= V1EltNum;
- }
- // Any value type smaller than i32 is illegal in AArch64, and this lower
- // function is called after legalize pass, so we need to legalize
- // the result here.
- EVT EltVT;
- if (VT.getVectorElementType().isFloatingPoint())
- EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
- else
- EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
-
- if (Mask >= 0) {
- ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
- DAG.getConstant(Mask, MVT::i64));
- InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
- DAG.getConstant(InsIndex[I], MVT::i64));
- }
- }
- return InsV;
-}
-
-AArch64TargetLowering::ConstraintType
-AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- default: break;
- case 'w': // An FP/SIMD vector register
- return C_RegisterClass;
- case 'I': // Constant that can be used with an ADD instruction
- case 'J': // Constant that can be used with a SUB instruction
- case 'K': // Constant that can be used with a 32-bit logical instruction
- case 'L': // Constant that can be used with a 64-bit logical instruction
- case 'M': // Constant that can be used as a 32-bit MOV immediate
- case 'N': // Constant that can be used as a 64-bit MOV immediate
- case 'Y': // Floating point constant zero
- case 'Z': // Integer constant zero
- return C_Other;
- case 'Q': // A memory reference with base register and no offset
- return C_Memory;
- case 'S': // A symbolic address
- return C_Other;
- }
- }
-
- // FIXME: Ump, Utf, Usa, Ush
- // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
- // whatever they may be
- // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
- // Usa: An absolute symbolic address
- // Ush: The high part (bits 32:12) of a pc-relative symbolic address
- assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
- && Constraint != "Ush" && "Unimplemented constraints");
-
- return TargetLowering::getConstraintType(Constraint);
-}
-
-TargetLowering::ConstraintWeight
-AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
- const char *Constraint) const {
-
- llvm_unreachable("Constraint weight unimplemented");
-}
-
-void
-AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
- std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const {
- SDValue Result(0, 0);
-
- // Only length 1 constraints are C_Other.
- if (Constraint.size() != 1) return;
-
- // Only C_Other constraints get lowered like this. That means constants for us
- // so return early if there's no hope the constraint can be lowered.
-
- switch(Constraint[0]) {
- default: break;
- case 'I': case 'J': case 'K': case 'L':
- case 'M': case 'N': case 'Z': {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C)
- return;
-
- uint64_t CVal = C->getZExtValue();
- uint32_t Bits;
-
- switch (Constraint[0]) {
- default:
- // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
- // is a peculiarly useless SUB constraint.
- llvm_unreachable("Unimplemented C_Other constraint");
- case 'I':
- if (CVal <= 0xfff)
- break;
- return;
- case 'K':
- if (A64Imms::isLogicalImm(32, CVal, Bits))
- break;
- return;
- case 'L':
- if (A64Imms::isLogicalImm(64, CVal, Bits))
- break;
- return;
- case 'Z':
- if (CVal == 0)
- break;
- return;
- }
-
- Result = DAG.getTargetConstant(CVal, Op.getValueType());
- break;
- }
- case 'S': {
- // An absolute symbolic address or label reference.
- if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
- Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
- GA->getValueType(0));
- } else if (const BlockAddressSDNode *BA
- = dyn_cast<BlockAddressSDNode>(Op)) {
- Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
- BA->getValueType(0));
- } else if (const ExternalSymbolSDNode *ES
- = dyn_cast<ExternalSymbolSDNode>(Op)) {
- Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
- ES->getValueType(0));
- } else
- return;
- break;
- }
- case 'Y':
- if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- if (CFP->isExactlyValue(0.0)) {
- Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
- break;
- }
- }
- return;
- }
-
- if (Result.getNode()) {
- Ops.push_back(Result);
- return;
- }
-
- // It's an unknown constraint for us. Let generic code have a go.
- TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
-}
-
-std::pair<unsigned, const TargetRegisterClass*>
-AArch64TargetLowering::getRegForInlineAsmConstraint(
- const std::string &Constraint,
- MVT VT) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- case 'r':
- if (VT.getSizeInBits() <= 32)
- return std::make_pair(0U, &AArch64::GPR32RegClass);
- else if (VT == MVT::i64)
- return std::make_pair(0U, &AArch64::GPR64RegClass);
- break;
- case 'w':
- if (VT == MVT::f16)
- return std::make_pair(0U, &AArch64::FPR16RegClass);
- else if (VT == MVT::f32)
- return std::make_pair(0U, &AArch64::FPR32RegClass);
- else if (VT.getSizeInBits() == 64)
- return std::make_pair(0U, &AArch64::FPR64RegClass);
- else if (VT.getSizeInBits() == 128)
- return std::make_pair(0U, &AArch64::FPR128RegClass);
- break;
- }
- }
-
- // Use the default implementation in TargetLowering to convert the register
- // constraint into a member of a register class.
- return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
-}
-
-/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
-/// The associated MachineMemOperands record the alignment specified
-/// in the intrinsic calls.
-bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I,
- unsigned Intrinsic) const {
- switch (Intrinsic) {
- case Intrinsic::arm_neon_vld1:
- case Intrinsic::arm_neon_vld2:
- case Intrinsic::arm_neon_vld3:
- case Intrinsic::arm_neon_vld4:
- case Intrinsic::aarch64_neon_vld1x2:
- case Intrinsic::aarch64_neon_vld1x3:
- case Intrinsic::aarch64_neon_vld1x4:
- case Intrinsic::arm_neon_vld2lane:
- case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- // Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
- Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
- Info.vol = false; // volatile loads with NEON intrinsics not supported
- Info.readMem = true;
- Info.writeMem = false;
+ Base = Op->getOperand(0);
+ // All of the indexed addressing mode instructions take a signed
+ // 9 bit immediate offset.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+ int64_t RHSC = (int64_t)RHS->getZExtValue();
+ if (RHSC >= 256 || RHSC <= -256)
+ return false;
+ IsInc = (Op->getOpcode() == ISD::ADD);
+ Offset = Op->getOperand(1);
return true;
}
- case Intrinsic::arm_neon_vst1:
- case Intrinsic::arm_neon_vst2:
- case Intrinsic::arm_neon_vst3:
- case Intrinsic::arm_neon_vst4:
- case Intrinsic::aarch64_neon_vst1x2:
- case Intrinsic::aarch64_neon_vst1x3:
- case Intrinsic::aarch64_neon_vst1x4:
- case Intrinsic::arm_neon_vst2lane:
- case Intrinsic::arm_neon_vst3lane:
- case Intrinsic::arm_neon_vst4lane: {
- Info.opc = ISD::INTRINSIC_VOID;
- // Conservatively set memVT to the entire set of vectors stored.
- unsigned NumElts = 0;
- for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
- Type *ArgTy = I.getArgOperand(ArgI)->getType();
- if (!ArgTy->isVectorTy())
- break;
- NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
- }
- Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
- Info.vol = false; // volatile stores with NEON intrinsics not supported
- Info.readMem = false;
- Info.writeMem = true;
- return true;
- }
- default:
- break;
- }
-
return false;
}
+
+bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ EVT VT;
+ SDValue Ptr;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ } else
+ return false;
+
+ bool IsInc;
+ if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+ return false;
+ AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+ return true;
+}
+
+bool AArch64TargetLowering::getPostIndexedAddressParts(
+ SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
+ EVT VT;
+ SDValue Ptr;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ } else
+ return false;
+
+ bool IsInc;
+ if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+ return false;
+ // Post-indexing updates the base, so it's not a valid transform
+ // if that's not the same as the load's pointer.
+ if (Ptr != Base)
+ return false;
+ AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+ return true;
+}
+
+void AArch64TargetLowering::ReplaceNodeResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Don't know how to custom expand this");
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+ // Let normal code take care of it by not adding anything to Results.
+ return;
+ }
+}
+
+bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+ // Loads and stores less than 128-bits are already atomic; ones above that
+ // are doomed anyway, so defer to the default libcall and blame the OS when
+ // things go wrong:
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return LI->getType()->getPrimitiveSizeInBits() == 128;
+
+ // For the real atomic operations, we have ldxr/stxr up to 128 bits.
+ return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+}
+
+Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+ bool IsAcquire =
+ Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+ // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
+ // intrinsic must return {i64, i64} and we have to recombine them into a
+ // single i128 here.
+ if (ValTy->getPrimitiveSizeInBits() == 128) {
+ Intrinsic::ID Int =
+ IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
+ Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+ Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+
+ Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+ Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+ Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ return Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+ }
+
+ Type *Tys[] = { Addr->getType() };
+ Intrinsic::ID Int =
+ IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
+ Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+ return Builder.CreateTruncOrBitCast(
+ Builder.CreateCall(Ldxr, Addr),
+ cast<PointerType>(Addr->getType())->getElementType());
+}
+
+Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+ Value *Val, Value *Addr,
+ AtomicOrdering Ord) const {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ bool IsRelease =
+ Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+ // Since the intrinsics must have legal type, the i128 intrinsics take two
+ // parameters: "i64, i64". We must marshal Val into the appropriate form
+ // before the call.
+ if (Val->getType()->getPrimitiveSizeInBits() == 128) {
+ Intrinsic::ID Int =
+ IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
+ Function *Stxr = Intrinsic::getDeclaration(M, Int);
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+ Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+ Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+ return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
+ }
+
+ Intrinsic::ID Int =
+ IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
+ Type *Tys[] = { Addr->getType() };
+ Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
+
+ return Builder.CreateCall2(
+ Stxr, Builder.CreateZExtOrBitCast(
+ Val, Stxr->getFunctionType()->getParamType(0)),
+ Addr);
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index e946b25..de16c4d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -12,255 +12,348 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AARCH64_ISELLOWERING_H
-#define LLVM_TARGET_AARCH64_ISELLOWERING_H
+#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H
+#define LLVM_TARGET_AArch64_ISELLOWERING_H
-#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetLowering.h"
namespace llvm {
+
namespace AArch64ISD {
- enum NodeType {
- // Start the numbering from where ISD NodeType finishes.
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
- // This is a conditional branch which also notes the flag needed
- // (eq/sgt/...). A64 puts this information on the branches rather than
- // compares as LLVM does.
- BR_CC,
+enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+ CALL, // Function call.
- // A node to be selected to an actual call operation: either BL or BLR in
- // the absence of tail calls.
- Call,
+ // Almost the same as a normal call node, except that a TLSDesc relocation is
+ // needed so the linker can relax it correctly if possible.
+ TLSDESC_CALL,
+ ADRP, // Page address of a TargetGlobalAddress operand.
+ ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand.
+ LOADgot, // Load from automatically generated descriptor (e.g. Global
+ // Offset Table, TLS record).
+ RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
+ BRCOND, // Conditional branch instruction; "b.cond".
+ CSEL,
+ FCSEL, // Conditional move instruction.
+ CSINV, // Conditional select invert.
+ CSNEG, // Conditional select negate.
+ CSINC, // Conditional select increment.
- // Indicates a floating-point immediate which fits into the format required
- // by the FMOV instructions. First (and only) operand is the 8-bit encoded
- // value of that immediate.
- FPMOV,
+ // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
+ // ELF.
+ THREAD_POINTER,
+ ADC,
+ SBC, // adc, sbc instructions
- // Corresponds directly to an EXTR instruction. Operands are an LHS an RHS
- // and an LSB.
- EXTR,
+ // Arithmetic instructions which write flags.
+ ADDS,
+ SUBS,
+ ADCS,
+ SBCS,
+ ANDS,
- // Wraps a load from the GOT, which should always be performed with a 64-bit
- // load instruction. This prevents the DAG combiner folding a truncate to
- // form a smaller memory access.
- GOTLoad,
+ // Floating point comparison
+ FCMP,
- // Performs a bitfield insert. Arguments are: the value being inserted into;
- // the value being inserted; least significant bit changed; width of the
- // field.
- BFI,
+ // Floating point max and min instructions.
+ FMAX,
+ FMIN,
- // Simply a convenient node inserted during ISelLowering to represent
- // procedure return. Will almost certainly be selected to "RET".
- Ret,
+ // Scalar extract
+ EXTR,
- /// Extracts a field of contiguous bits from the source and sign extends
- /// them into a single register. Arguments are: source; immr; imms. Note
- /// these are pre-encoded since DAG matching can't cope with combining LSB
- /// and Width into these values itself.
- SBFX,
+ // Scalar-to-vector duplication
+ DUP,
+ DUPLANE8,
+ DUPLANE16,
+ DUPLANE32,
+ DUPLANE64,
- /// This is an A64-ification of the standard LLVM SELECT_CC operation. The
- /// main difference is that it only has the values and an A64 condition,
- /// which will be produced by a setcc instruction.
- SELECT_CC,
+ // Vector immedate moves
+ MOVI,
+ MOVIshift,
+ MOVIedit,
+ MOVImsl,
+ FMOV,
+ MVNIshift,
+ MVNImsl,
- /// This serves most of the functions of the LLVM SETCC instruction, for two
- /// purposes. First, it prevents optimisations from fiddling with the
- /// compare after we've moved the CondCode information onto the SELECT_CC or
- /// BR_CC instructions. Second, it gives a legal instruction for the actual
- /// comparison.
- ///
- /// It keeps a record of the condition flags asked for because certain
- /// instructions are only valid for a subset of condition codes.
- SETCC,
+ // Vector immediate ops
+ BICi,
+ ORRi,
- // Designates a node which is a tail call: both a call and a return
- // instruction as far as selction is concerned. It should be selected to an
- // unconditional branch. Has the usual plethora of call operands, but: 1st
- // is callee, 2nd is stack adjustment required immediately before branch.
- TC_RETURN,
+ // Vector bit select: similar to ISD::VSELECT but not all bits within an
+ // element must be identical.
+ BSL,
- // Designates a call used to support the TLS descriptor ABI. The call itself
- // will be indirect ("BLR xN") but a relocation-specifier (".tlsdesccall
- // var") must be attached somehow during code generation. It takes two
- // operands: the callee and the symbol to be relocated against.
- TLSDESCCALL,
+ // Vector arithmetic negation
+ NEG,
- // Leaf node which will be lowered to an appropriate MRS to obtain the
- // thread pointer: TPIDR_EL0.
- THREAD_POINTER,
+ // Vector shuffles
+ ZIP1,
+ ZIP2,
+ UZP1,
+ UZP2,
+ TRN1,
+ TRN2,
+ REV16,
+ REV32,
+ REV64,
+ EXT,
- /// Extracts a field of contiguous bits from the source and zero extends
- /// them into a single register. Arguments are: source; immr; imms. Note
- /// these are pre-encoded since DAG matching can't cope with combining LSB
- /// and Width into these values itself.
- UBFX,
+ // Vector shift by scalar
+ VSHL,
+ VLSHR,
+ VASHR,
- // Wraps an address which the ISelLowering phase has decided should be
- // created using the large memory model style: i.e. a sequence of four
- // movz/movk instructions.
- WrapperLarge,
+ // Vector shift by scalar (again)
+ SQSHL_I,
+ UQSHL_I,
+ SQSHLU_I,
+ SRSHR_I,
+ URSHR_I,
- // Wraps an address which the ISelLowering phase has decided should be
- // created using the small memory model style: i.e. adrp/add or
- // adrp/mem-op. This exists to prevent bare TargetAddresses which may never
- // get selected.
- WrapperSmall,
+ // Vector comparisons
+ CMEQ,
+ CMGE,
+ CMGT,
+ CMHI,
+ CMHS,
+ FCMEQ,
+ FCMGE,
+ FCMGT,
- // Vector move immediate
- NEON_MOVIMM,
+ // Vector zero comparisons
+ CMEQz,
+ CMGEz,
+ CMGTz,
+ CMLEz,
+ CMLTz,
+ FCMEQz,
+ FCMGEz,
+ FCMGTz,
+ FCMLEz,
+ FCMLTz,
- // Vector Move Inverted Immediate
- NEON_MVNIMM,
+ // Vector bitwise negation
+ NOT,
- // Vector FP move immediate
- NEON_FMOVIMM,
+ // Vector bitwise selection
+ BIT,
- // Vector permute
- NEON_UZP1,
- NEON_UZP2,
- NEON_ZIP1,
- NEON_ZIP2,
- NEON_TRN1,
- NEON_TRN2,
+ // Compare-and-branch
+ CBZ,
+ CBNZ,
+ TBZ,
+ TBNZ,
- // Vector Element reverse
- NEON_REV64,
- NEON_REV32,
- NEON_REV16,
+ // Tail calls
+ TC_RETURN,
- // Vector compare
- NEON_CMP,
+ // Custom prefetch handling
+ PREFETCH,
- // Vector compare zero
- NEON_CMPZ,
+ // {s|u}int to FP within a FP register.
+ SITOF,
+ UITOF,
- // Vector compare bitwise test
- NEON_TST,
+ // NEON Load/Store with post-increment base updates
+ LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LD3post,
+ LD4post,
+ ST2post,
+ ST3post,
+ ST4post,
+ LD1x2post,
+ LD1x3post,
+ LD1x4post,
+ ST1x2post,
+ ST1x3post,
+ ST1x4post,
+ LD1DUPpost,
+ LD2DUPpost,
+ LD3DUPpost,
+ LD4DUPpost,
+ LD1LANEpost,
+ LD2LANEpost,
+ LD3LANEpost,
+ LD4LANEpost,
+ ST2LANEpost,
+ ST3LANEpost,
+ ST4LANEpost
+};
- // Vector saturating shift
- NEON_QSHLs,
- NEON_QSHLu,
-
- // Vector dup
- NEON_VDUP,
-
- // Vector dup by lane
- NEON_VDUPLANE,
-
- // Vector extract
- NEON_VEXTRACT,
-
- // NEON duplicate lane loads
- NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
- NEON_LD3DUP,
- NEON_LD4DUP,
-
- // NEON loads with post-increment base updates:
- NEON_LD1_UPD,
- NEON_LD2_UPD,
- NEON_LD3_UPD,
- NEON_LD4_UPD,
- NEON_LD1x2_UPD,
- NEON_LD1x3_UPD,
- NEON_LD1x4_UPD,
-
- // NEON stores with post-increment base updates:
- NEON_ST1_UPD,
- NEON_ST2_UPD,
- NEON_ST3_UPD,
- NEON_ST4_UPD,
- NEON_ST1x2_UPD,
- NEON_ST1x3_UPD,
- NEON_ST1x4_UPD,
-
- // NEON duplicate lane loads with post-increment base updates:
- NEON_LD2DUP_UPD,
- NEON_LD3DUP_UPD,
- NEON_LD4DUP_UPD,
-
- // NEON lane loads with post-increment base updates:
- NEON_LD2LN_UPD,
- NEON_LD3LN_UPD,
- NEON_LD4LN_UPD,
-
- // NEON lane store with post-increment base updates:
- NEON_ST2LN_UPD,
- NEON_ST3LN_UPD,
- NEON_ST4LN_UPD
- };
-}
-
+} // end namespace AArch64ISD
class AArch64Subtarget;
class AArch64TargetMachine;
class AArch64TargetLowering : public TargetLowering {
+ bool RequireStrictAlign;
+
public:
explicit AArch64TargetLowering(AArch64TargetMachine &TM);
- const char *getTargetNodeName(unsigned Opcode) const;
+ /// Selects the correct CCAssignFn for a the given CallingConvention
+ /// value.
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
- CCAssignFn *CCAssignFnForNode(CallingConv::ID CC) const;
+ /// computeKnownBitsForTargetNode - Determine which of the bits specified in
+ /// Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+ APInt &KnownOne, const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
- SDValue LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ MVT getScalarShiftAmountTy(EVT LHSTy) const override;
- SDValue LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const;
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses. of the specified type.
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+ bool *Fast = nullptr) const override {
+ if (RequireStrictAlign)
+ return false;
+ // FIXME: True for Cyclone, but not necessary others.
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
- virtual unsigned getByValTypeAlignment(Type *Ty) const override;
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
- SDValue LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ /// getFunctionAlignment - Return the Log2 alignment of this function.
+ unsigned getFunctionAlignment(const Function *F) const;
+
+ /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+ /// be used for loads / stores from the global.
+ unsigned getMaximalGlobalOffset() const override;
+
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ /// isShuffleMaskLegal - Return true if the given shuffle mask can be
+ /// codegen'd directly, or if it should be stack expanded.
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+ SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+ MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool hasPairedLoad(Type *LoadedType,
+ unsigned &RequiredAligment) const override;
+ bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+
+ bool isLegalAddImmediate(int64_t) const override;
+ bool isLegalICmpImmediate(int64_t) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+ /// \brief Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
+
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+ /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+ bool isDesirableToCommuteWithShift(const SDNode *N) const override;
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) const override;
+
+ bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+
+private:
+ /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const AArch64Subtarget *Subtarget;
+
+ void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
+ void addDRTypeForNEON(MVT VT);
+ void addQRTypeForNEON(MVT VT);
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCall(CallLoweringInfo & /*CLI*/,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ bool isThisReturn, SDValue ThisVal) const;
- SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
-
- bool isConcatVector(SDValue Op, SelectionDAG &DAG, SDValue V0, SDValue V1,
- const int *Mask, SDValue &Res) const;
-
- bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &V0,
- SDValue &V1, int *Mask) const;
-
- SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
- const AArch64Subtarget *ST) const;
-
- SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
-
- void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
- SDValue &Chain) const;
-
- /// IsEligibleForTailCallOptimization - Check whether the call is eligible
- /// for tail call optimization. Targets which want to do tail call
- /// optimization should implement this function.
- bool IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool IsVarArg,
- bool IsCalleeStructRet,
- bool IsCallerStructRet,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const;
+ bool isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
/// Finds the incoming stack arguments which overlap the given fixed stack
/// object and incorporates their load into the current chain. This prevents
@@ -268,108 +361,104 @@
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
MachineFrameInfo *MFI, int ClobberedFI) const;
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
-
bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
bool IsTailCallConvention(CallingConv::ID CallCC) const;
- SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+ void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+ SDValue &Chain) const;
- bool isLegalICmpImmediate(int64_t Val) const;
- SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
- virtual MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ SelectionDAG &DAG) const override;
- MachineBasicBlock *
- emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB,
- unsigned Size, unsigned Opcode) const;
-
- MachineBasicBlock *
- emitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size, unsigned CmpOp,
- A64CC::CondCodes Cond) const;
- MachineBasicBlock *
- emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size) const;
-
- MachineBasicBlock *
- EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const;
-
- SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
- RTLIB::Libcall Call) const;
- SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
- SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
- SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
- SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
+ SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const;
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
- virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ ConstraintType
+ getConstraintType(const std::string &Constraint) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT) const override;
- /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
- /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
- /// expanded to FMAs when this method returns true, otherwise fmuladd is
- /// expanded to fmul + fadd.
- virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
- ConstraintType getConstraintType(const std::string &Constraint) const;
-
- ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info,
- const char *Constraint) const;
- void LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const override;
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const;
+ SelectionDAG &DAG) const override;
- std::pair<unsigned, const TargetRegisterClass*>
- getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM, bool &IsInc,
+ SelectionDAG &DAG) const;
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
- virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
- unsigned Intrinsic) const override;
-
-protected:
- std::pair<const TargetRegisterClass*, uint8_t>
- findRepresentativeClass(MVT VT) const;
-
-private:
- const InstrItineraryData *Itins;
-
- const AArch64Subtarget *getSubtarget() const {
- return &getTargetMachine().getSubtarget<AArch64Subtarget>();
- }
-};
-enum NeonModImmType {
- Neon_Mov_Imm,
- Neon_Mvn_Imm
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
};
-extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
- bool &usesOnlyOneValue, bool &hasDominantValue,
- bool &isConstant, bool &isUNDEF);
-} // namespace llvm
+namespace AArch64 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+} // end namespace AArch64
-#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
+} // end namespace llvm
+
+#endif // LLVM_TARGET_AArch64_ISELLOWERING_H
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
new file mode 100644
index 0000000..3b9e3c6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -0,0 +1,364 @@
+//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Atomic operand code-gen constructs.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------
+// Atomic fences
+//===----------------------------------
+def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+
+//===----------------------------------
+// Atomic loads
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A atomic load operation that actually needs acquire semantics.
+class acquiring_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ assert(Ordering != AcquireRelease && "unexpected load ordering");
+ return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit loads
+def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend8:$offset)),
+ (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$offset)),
+ (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8>
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit loads
+def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)),
+ (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)),
+ (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)),
+ (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_load<atomic_load_16>
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit loads
+def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend)),
+ (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend)),
+ (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn,
+ uimm12s4:$offset)),
+ (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_load<atomic_load_32>
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+ (LDURWi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit loads
+def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend)),
+ (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend)),
+ (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn,
+ uimm12s8:$offset)),
+ (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_64>
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (LDURXi GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Atomic stores
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A store operation that actually needs release semantics.
+class releasing_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ assert(Ordering != AcquireRelease && "unexpected store ordering");
+ return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic store operation that doesn't actually need to be atomic on AArch64.
+class relaxed_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit stores
+def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
+ (STLRB GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+ GPR32:$val),
+ (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+ GPR32:$val),
+ (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val),
+ (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_store<atomic_store_8>
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+ (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit stores
+def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
+ (STLRH GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend),
+ GPR32:$val),
+ (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend),
+ GPR32:$val),
+ (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16>
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val),
+ (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_store<atomic_store_16>
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+ (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit stores
+def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
+ (STLRW GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend),
+ GPR32:$val),
+ (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend),
+ GPR32:$val),
+ (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32>
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val),
+ (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_32>
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+ (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit stores
+def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
+ (STLRX GPR64:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend),
+ GPR64:$val),
+ (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend),
+ GPR64:$val),
+ (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64>
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val),
+ (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_store<atomic_store_64>
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val),
+ (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Low-level exclusive operations
+//===----------------------------------
+
+// Load-exclusives.
+
+def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldxr_1 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_2 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_4 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff),
+ (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff),
+ (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
+ (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+
+// Load-exclusives.
+
+def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldaxr_1 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_2 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_4 GPR64sp:$addr),
+ (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff),
+ (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff),
+ (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
+ (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+
+// Store-exclusives.
+
+def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
+ (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr),
+ (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr),
+ (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr),
+ (STXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+ (STXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+ (STXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr),
+ (STXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+ (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+ (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+ (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+// Store-release-exclusives.
+
+def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
+ (int_aarch64_stlxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
+ (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr),
+ (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr),
+ (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr),
+ (STLXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+ (STLXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+ (STLXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr),
+ (STLXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+ (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+ (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+ (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+
+// And clear exclusive.
+
+def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 4cc3813..d455d7e 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1,4 +1,4 @@
-//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tablegen -*-=//
+//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,1482 +6,8569 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-// This file describes AArch64 instruction formats, down to the level of the
-// instruction's overall class.
-//===----------------------------------------------------------------------===//
-
//===----------------------------------------------------------------------===//
-// A64 Instruction Format Definitions.
-//===----------------------------------------------------------------------===//
+// Describe AArch64 instructions format here
+//
-// A64 is currently the only instruction set supported by the AArch64
-// architecture.
-class A64Inst<dag outs, dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : Instruction {
- // All A64 instructions are 32-bit. This field will be filled in
- // gradually going down the hierarchy.
- field bits<32> Inst;
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+ bits<2> Value = val;
+}
+def PseudoFrm : Format<0>;
+def NormalFrm : Format<1>; // Do we need any others?
+
+// AArch64 Instruction Format
+class AArch64Inst<Format f, string cstr> : Instruction {
+ field bits<32> Inst; // Instruction encoding.
+ // Mask of bits that cause an encoding to be UNPREDICTABLE.
+ // If a bit is set, then if the corresponding bit in the
+ // target encoding differs from its value in the "Inst" field,
+ // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
field bits<32> Unpredictable = 0;
// SoftFail is the generic name for this field, but we alias it so
// as to make it more obvious what it means in ARM-land.
field bits<32> SoftFail = Unpredictable;
-
- // LLVM-level model of the AArch64/A64 distinction.
- let Namespace = "AArch64";
- let DecoderNamespace = "A64";
- let Size = 4;
-
- // Set the templated fields
- let OutOperandList = outs;
- let InOperandList = ins;
- let AsmString = asmstr;
- let Pattern = patterns;
- let Itinerary = itin;
+ let Namespace = "AArch64";
+ Format F = f;
+ bits<2> Form = F.Value;
+ let Pattern = [];
+ let Constraints = cstr;
}
-class PseudoInst<dag outs, dag ins, list<dag> patterns> : Instruction {
- let Namespace = "AArch64";
-
- let OutOperandList = outs;
- let InOperandList= ins;
- let Pattern = patterns;
- let isCodeGenOnly = 1;
- let isPseudo = 1;
+// Pseudo instructions (don't have encoding information)
+class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
+ : AArch64Inst<PseudoFrm, cstr> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let Pattern = pattern;
+ let isCodeGenOnly = 1;
}
-// Represents a pseudo-instruction that represents a single A64 instruction for
-// whatever reason, the eventual result will be a 32-bit real instruction.
-class A64PseudoInst<dag outs, dag ins, list<dag> patterns>
- : PseudoInst<outs, ins, patterns> {
+// Real instructions (have encoding information)
+class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
+ let Pattern = pattern;
let Size = 4;
}
-// As above, this will be a single A64 instruction, but we can actually give the
-// expansion in TableGen.
-class A64PseudoExpand<dag outs, dag ins, list<dag> patterns, dag Result>
- : A64PseudoInst<outs, ins, patterns>,
- PseudoInstExpansion<Result>;
-
-
-// First, some common cross-hierarchy register formats.
-
-class A64InstRd<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
- bits<5> Rd;
-
- let Inst{4-0} = Rd;
+// Normal instructions
+class I<dag oops, dag iops, string asm, string operands, string cstr,
+ list<dag> pattern>
+ : EncodedI<cstr, pattern> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let AsmString = !strconcat(asm, operands);
}
-class A64InstRt<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
+class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>;
+
+// Helper fragment for an extract of the high portion of a 128-bit vector.
+def extract_high_v16i8 :
+ UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+def extract_high_v8i16 :
+ UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+def extract_high_v4i32 :
+ UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
+def extract_high_v2i64 :
+ UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+
+//===----------------------------------------------------------------------===//
+// Asm Operand Classes.
+//
+
+// Shifter operand for arithmetic shifted encodings.
+def ShifterOperand : AsmOperandClass {
+ let Name = "Shifter";
+}
+
+// Shifter operand for mov immediate encodings.
+def MovImm32ShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MovImm32Shifter";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "InvalidMovImm32Shift";
+}
+def MovImm64ShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MovImm64Shifter";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "InvalidMovImm64Shift";
+}
+
+// Shifter operand for arithmetic register shifted encodings.
+class ArithmeticShifterOperand<int width> : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "ArithmeticShifter" # width;
+ let PredicateMethod = "isArithmeticShifter<" # width # ">";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
+def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
+
+// Shifter operand for logical register shifted encodings.
+class LogicalShifterOperand<int width> : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "LogicalShifter" # width;
+ let PredicateMethod = "isLogicalShifter<" # width # ">";
+ let RenderMethod = "addShifterOperands";
+ let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def LogicalShifterOperand32 : LogicalShifterOperand<32>;
+def LogicalShifterOperand64 : LogicalShifterOperand<64>;
+
+// Shifter operand for logical vector 128/64-bit shifted encodings.
+def LogicalVecShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "LogicalVecShifter";
+ let RenderMethod = "addShifterOperands";
+}
+def LogicalVecHalfWordShifterOperand : AsmOperandClass {
+ let SuperClasses = [LogicalVecShifterOperand];
+ let Name = "LogicalVecHalfWordShifter";
+ let RenderMethod = "addShifterOperands";
+}
+
+// The "MSL" shifter on the vector MOVI instruction.
+def MoveVecShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MoveVecShifter";
+ let RenderMethod = "addShifterOperands";
+}
+
+// Extend operand for arithmetic encodings.
+def ExtendOperand : AsmOperandClass {
+ let Name = "Extend";
+ let DiagnosticType = "AddSubRegExtendLarge";
+}
+def ExtendOperand64 : AsmOperandClass {
+ let SuperClasses = [ExtendOperand];
+ let Name = "Extend64";
+ let DiagnosticType = "AddSubRegExtendSmall";
+}
+// 'extend' that's a lsl of a 64-bit register.
+def ExtendOperandLSL64 : AsmOperandClass {
+ let SuperClasses = [ExtendOperand];
+ let Name = "ExtendLSL64";
+ let RenderMethod = "addExtend64Operands";
+ let DiagnosticType = "AddSubRegExtendLarge";
+}
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+ let Name = "FPImm";
+ let ParserMethod = "tryParseFPImm";
+ let DiagnosticType = "InvalidFPImm";
+}
+
+def CondCode : AsmOperandClass {
+ let Name = "CondCode";
+ let DiagnosticType = "InvalidCondCode";
+}
+
+// A 32-bit register pasrsed as 64-bit
+def GPR32as64Operand : AsmOperandClass {
+ let Name = "GPR32as64";
+}
+def GPR32as64 : RegisterOperand<GPR32> {
+ let ParserMatchClass = GPR32as64Operand;
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// ADR[P] instruction labels.
+def AdrpOperand : AsmOperandClass {
+ let Name = "AdrpLabel";
+ let ParserMethod = "tryParseAdrpLabel";
+ let DiagnosticType = "InvalidLabel";
+}
+def adrplabel : Operand<i64> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let PrintMethod = "printAdrpLabel";
+ let ParserMatchClass = AdrpOperand;
+}
+
+def AdrOperand : AsmOperandClass {
+ let Name = "AdrLabel";
+ let ParserMethod = "tryParseAdrLabel";
+ let DiagnosticType = "InvalidLabel";
+}
+def adrlabel : Operand<i64> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let ParserMatchClass = AdrOperand;
+}
+
+// simm9 predicate - True if the immediate is in the range [-256, 255].
+def SImm9Operand : AsmOperandClass {
+ let Name = "SImm9";
+ let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
+ let ParserMatchClass = SImm9Operand;
+}
+
+// simm7sN predicate - True if the immediate is a multiple of N in the range
+// [-64 * N, 63 * N].
+class SImm7Scaled<int Scale> : AsmOperandClass {
+ let Name = "SImm7s" # Scale;
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
+}
+
+def SImm7s4Operand : SImm7Scaled<4>;
+def SImm7s8Operand : SImm7Scaled<8>;
+def SImm7s16Operand : SImm7Scaled<16>;
+
+def simm7s4 : Operand<i32> {
+ let ParserMatchClass = SImm7s4Operand;
+ let PrintMethod = "printImmScale<4>";
+}
+
+def simm7s8 : Operand<i32> {
+ let ParserMatchClass = SImm7s8Operand;
+ let PrintMethod = "printImmScale<8>";
+}
+
+def simm7s16 : Operand<i32> {
+ let ParserMatchClass = SImm7s16Operand;
+ let PrintMethod = "printImmScale<16>";
+}
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+ let Name = "Imm" # Low # "_" # High;
+ let DiagnosticType = "InvalidImm" # Low # "_" # High;
+}
+
+def Imm1_8Operand : AsmImmRange<1, 8>;
+def Imm1_16Operand : AsmImmRange<1, 16>;
+def Imm1_32Operand : AsmImmRange<1, 32>;
+def Imm1_64Operand : AsmImmRange<1, 64>;
+
+def MovZSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG3";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG3AsmOperand;
+}
+
+def MovZSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG2";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG2AsmOperand;
+}
+
+def MovZSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG1";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG1AsmOperand;
+}
+
+def MovZSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG0";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG0AsmOperand;
+}
+
+def MovKSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG3";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG3AsmOperand;
+}
+
+def MovKSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG2";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG2AsmOperand;
+}
+
+def MovKSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG1";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG1AsmOperand;
+}
+
+def MovKSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG0";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG0AsmOperand;
+}
+
+class fixedpoint_i32<ValueType FloatVT>
+ : Operand<FloatVT>,
+ ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
+ let EncoderMethod = "getFixedPointScaleOpValue";
+ let DecoderMethod = "DecodeFixedPointScaleImm32";
+ let ParserMatchClass = Imm1_32Operand;
+}
+
+class fixedpoint_i64<ValueType FloatVT>
+ : Operand<FloatVT>,
+ ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
+ let EncoderMethod = "getFixedPointScaleOpValue";
+ let DecoderMethod = "DecodeFixedPointScaleImm64";
+ let ParserMatchClass = Imm1_64Operand;
+}
+
+def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
+def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+
+def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
+def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
+
+def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR8OpValue";
+ let DecoderMethod = "DecodeVecShiftR8Imm";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16Imm";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32Imm";
+ let ParserMatchClass = Imm1_32Operand;
+}
+def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64Imm";
+ let ParserMatchClass = Imm1_64Operand;
+}
+def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
+ let ParserMatchClass = Imm1_32Operand;
+}
+
+def Imm0_7Operand : AsmImmRange<0, 7>;
+def Imm0_15Operand : AsmImmRange<0, 15>;
+def Imm0_31Operand : AsmImmRange<0, 31>;
+def Imm0_63Operand : AsmImmRange<0, 63>;
+
+def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 8);
+}]> {
+ let EncoderMethod = "getVecShiftL8OpValue";
+ let DecoderMethod = "DecodeVecShiftL8Imm";
+ let ParserMatchClass = Imm0_7Operand;
+}
+def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 16);
+}]> {
+ let EncoderMethod = "getVecShiftL16OpValue";
+ let DecoderMethod = "DecodeVecShiftL16Imm";
+ let ParserMatchClass = Imm0_15Operand;
+}
+def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+}]> {
+ let EncoderMethod = "getVecShiftL32OpValue";
+ let DecoderMethod = "DecodeVecShiftL32Imm";
+ let ParserMatchClass = Imm0_31Operand;
+}
+def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 64);
+}]> {
+ let EncoderMethod = "getVecShiftL64OpValue";
+ let DecoderMethod = "DecodeVecShiftL64Imm";
+ let ParserMatchClass = Imm0_63Operand;
+}
+
+
+// Crazy immediate formats used by 32-bit and 64-bit logical immediate
+// instructions for splatting repeating bit patterns across the immediate.
+def logical_imm32_XFORM : SDNodeXForm<imm, [{
+ uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+def logical_imm64_XFORM : SDNodeXForm<imm, [{
+ uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+
+def LogicalImm32Operand : AsmOperandClass {
+ let Name = "LogicalImm32";
+ let DiagnosticType = "LogicalSecondSource";
+}
+def LogicalImm64Operand : AsmOperandClass {
+ let Name = "LogicalImm64";
+ let DiagnosticType = "LogicalSecondSource";
+}
+def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
+ return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
+}], logical_imm32_XFORM> {
+ let PrintMethod = "printLogicalImm32";
+ let ParserMatchClass = LogicalImm32Operand;
+}
+def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
+ return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64);
+}], logical_imm64_XFORM> {
+ let PrintMethod = "printLogicalImm64";
+ let ParserMatchClass = LogicalImm64Operand;
+}
+
+// imm0_65535 predicate - True if the immediate is in the range [0,65535].
+def Imm0_65535Operand : AsmImmRange<0, 65535>;
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 65536;
+}]> {
+ let ParserMatchClass = Imm0_65535Operand;
+ let PrintMethod = "printHexImm";
+}
+
+// imm0_255 predicate - True if the immediate is in the range [0,255].
+def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 256;
+}]> {
+ let ParserMatchClass = Imm0_255Operand;
+ let PrintMethod = "printHexImm";
+}
+
+// imm0_127 predicate - True if the immediate is in the range [0,127]
+def Imm0_127Operand : AsmImmRange<0, 127>;
+def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 128;
+}]> {
+ let ParserMatchClass = Imm0_127Operand;
+ let PrintMethod = "printHexImm";
+}
+
+// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
+// for all shift-amounts.
+
+// imm0_63 predicate - True if the immediate is in the range [0,63]
+def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 64;
+}]> {
+ let ParserMatchClass = Imm0_63Operand;
+}
+
+// imm0_31 predicate - True if the immediate is in the range [0,31]
+def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 32;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_15 predicate - True if the immediate is in the range [0,15]
+def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 16;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+}
+
+// imm0_7 predicate - True if the immediate is in the range [0,7]
+def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = Imm0_7Operand;
+}
+
+// An arithmetic shifter operand:
+// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
+// {5-0} - imm6
+class arith_shift<ValueType Ty, int width> : Operand<Ty> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = !cast<AsmOperandClass>(
+ "ArithmeticShifterOperand" # width);
+}
+
+def arith_shift32 : arith_shift<i32, 32>;
+def arith_shift64 : arith_shift<i64, 64>;
+
+class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
+ : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
+ let PrintMethod = "printShiftedRegister";
+ let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width));
+}
+
+def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
+def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+
+// An arithmetic shifter operand:
+// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
+// {5-0} - imm6
+class logical_shift<int width> : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = !cast<AsmOperandClass>(
+ "LogicalShifterOperand" # width);
+}
+
+def logical_shift32 : logical_shift<32>;
+def logical_shift64 : logical_shift<64>;
+
+class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
+ : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
+ let PrintMethod = "printShiftedRegister";
+ let MIOperandInfo = (ops regclass, shiftop);
+}
+
+def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
+def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+
+// A logical vector shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0, #8, #16, or #24
+def logical_vec_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getVecShifterOpValue";
+ let ParserMatchClass = LogicalVecShifterOperand;
+}
+
+// A logical vector half-word shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0 or #8
+def logical_vec_hw_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getVecShifterOpValue";
+ let ParserMatchClass = LogicalVecHalfWordShifterOperand;
+}
+
+// A vector move shifter operand:
+// {0} - imm1: #8 or #16
+def move_vec_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getMoveVecShifterOpValue";
+ let ParserMatchClass = MoveVecShifterOperand;
+}
+
+def AddSubImmOperand : AsmOperandClass {
+ let Name = "AddSubImm";
+ let ParserMethod = "tryParseAddSubImm";
+ let DiagnosticType = "AddSubSecondSource";
+}
+// An ADD/SUB immediate shifter operand:
+// second operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0 or #12
+class addsub_shifted_imm<ValueType Ty>
+ : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
+ let PrintMethod = "printAddSubImm";
+ let EncoderMethod = "getAddSubImmOpValue";
+ let ParserMatchClass = AddSubImmOperand;
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
+def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
+
+class neg_addsub_shifted_imm<ValueType Ty>
+ : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
+ let PrintMethod = "printAddSubImm";
+ let EncoderMethod = "getAddSubImmOpValue";
+ let ParserMatchClass = AddSubImmOperand;
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
+def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+
+// An extend operand:
+// {5-3} - extend type
+// {2-0} - imm3
+def arith_extend : Operand<i32> {
+ let PrintMethod = "printArithExtend";
+ let ParserMatchClass = ExtendOperand;
+}
+def arith_extend64 : Operand<i32> {
+ let PrintMethod = "printArithExtend";
+ let ParserMatchClass = ExtendOperand64;
+}
+
+// 'extend' that's a lsl of a 64-bit register.
+def arith_extendlsl64 : Operand<i32> {
+ let PrintMethod = "printArithExtend";
+ let ParserMatchClass = ExtendOperandLSL64;
+}
+
+class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+ let PrintMethod = "printExtendedRegister";
+ let MIOperandInfo = (ops GPR32, arith_extend);
+}
+
+class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+ let PrintMethod = "printExtendedRegister";
+ let MIOperandInfo = (ops GPR32, arith_extend64);
+}
+
+// Floating-point immediate.
+def fpimm32 : Operand<f32>,
+ PatLeaf<(f32 fpimm), [{
+ return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::getFP32Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+def fpimm64 : Operand<f64>,
+ PatLeaf<(f64 fpimm), [{
+ return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::getFP64Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm8 : Operand<i32> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+// Vector lane operands
+class AsmVectorIndex<string Suffix> : AsmOperandClass {
+ let Name = "VectorIndex" # Suffix;
+ let DiagnosticType = "InvalidIndex" # Suffix;
+}
+def VectorIndex1Operand : AsmVectorIndex<"1">;
+def VectorIndexBOperand : AsmVectorIndex<"B">;
+def VectorIndexHOperand : AsmVectorIndex<"H">;
+def VectorIndexSOperand : AsmVectorIndex<"S">;
+def VectorIndexDOperand : AsmVectorIndex<"D">;
+
+def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) == 1;
+}]> {
+ let ParserMatchClass = VectorIndex1Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 16;
+}]> {
+ let ParserMatchClass = VectorIndexBOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = VectorIndexHOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 4;
+}]> {
+ let ParserMatchClass = VectorIndexSOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = VectorIndexDOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def simdimmtype10 : Operand<i32>,
+ PatLeaf<(f64 fpimm), [{
+ return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let ParserMatchClass = SIMDImmType10Operand;
+ let PrintMethod = "printSIMDType10Operand";
+}
+
+
+//---
+// System management
+//---
+
+// Base encoding for system instruction operands.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
+ : I<oops, iops, asm, operands, "", []> {
+ let Inst{31-22} = 0b1101010100;
+ let Inst{21} = L;
+}
+
+// System instructions which do not have an Rt register.
+class SimpleSystemI<bit L, dag iops, string asm, string operands>
+ : BaseSystemI<L, (outs), iops, asm, operands> {
+ let Inst{4-0} = 0b11111;
+}
+
+// System instructions which have an Rt register.
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
+ : BaseSystemI<L, oops, iops, asm, operands>,
+ Sched<[WriteSys]> {
bits<5> Rt;
-
let Inst{4-0} = Rt;
}
+// Hint instructions that take both a CRm and a 3-bit immediate.
+class HintI<string mnemonic>
+ : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
+ Sched<[WriteHint]> {
+ bits <7> imm;
+ let Inst{20-12} = 0b000110010;
+ let Inst{11-5} = imm;
+}
-class A64InstRdn<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRd<outs, ins, asmstr, patterns, itin> {
- // Inherit rdt
+// System instructions taking a single literal operand which encodes into
+// CRm. op2 differentiates the opcodes.
+def BarrierAsmOperand : AsmOperandClass {
+ let Name = "Barrier";
+ let ParserMethod = "tryParseBarrierOperand";
+}
+def barrier_op : Operand<i32> {
+ let PrintMethod = "printBarrierOption";
+ let ParserMatchClass = BarrierAsmOperand;
+}
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
+ : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
+ Sched<[WriteBarrier]> {
+ bits<4> CRm;
+ let Inst{20-12} = 0b000110011;
+ let Inst{11-8} = CRm;
+ let Inst{7-5} = opc;
+}
+
+// MRS/MSR system instructions. These have different operand classes because
+// a different subset of registers can be accessed through each instruction.
+def MRSSystemRegisterOperand : AsmOperandClass {
+ let Name = "MRSSystemRegister";
+ let ParserMethod = "tryParseSysReg";
+ let DiagnosticType = "MRS";
+}
+// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+def mrs_sysreg_op : Operand<i32> {
+ let ParserMatchClass = MRSSystemRegisterOperand;
+ let DecoderMethod = "DecodeMRSSystemRegister";
+ let PrintMethod = "printMRSSystemRegister";
+}
+
+def MSRSystemRegisterOperand : AsmOperandClass {
+ let Name = "MSRSystemRegister";
+ let ParserMethod = "tryParseSysReg";
+ let DiagnosticType = "MSR";
+}
+def msr_sysreg_op : Operand<i32> {
+ let ParserMatchClass = MSRSystemRegisterOperand;
+ let DecoderMethod = "DecodeMSRSystemRegister";
+ let PrintMethod = "printMSRSystemRegister";
+}
+
+class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
+ "mrs", "\t$Rt, $systemreg"> {
+ bits<15> systemreg;
+ let Inst{20} = 1;
+ let Inst{19-5} = systemreg;
+}
+
+// FIXME: Some of these def NZCV, others don't. Best way to model that?
+// Explicitly modeling each of the system register as a register class
+// would do it, but feels like overkill at this point.
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
+ "msr", "\t$systemreg, $Rt"> {
+ bits<15> systemreg;
+ let Inst{20} = 1;
+ let Inst{19-5} = systemreg;
+}
+
+def SystemPStateFieldOperand : AsmOperandClass {
+ let Name = "SystemPStateField";
+ let ParserMethod = "tryParseSysReg";
+}
+def pstatefield_op : Operand<i32> {
+ let ParserMatchClass = SystemPStateFieldOperand;
+ let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateI
+ : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm),
+ "msr", "\t$pstate_field, $imm">,
+ Sched<[WriteSys]> {
+ bits<6> pstatefield;
+ bits<4> imm;
+ let Inst{20-19} = 0b00;
+ let Inst{18-16} = pstatefield{5-3};
+ let Inst{15-12} = 0b0100;
+ let Inst{11-8} = imm;
+ let Inst{7-5} = pstatefield{2-0};
+
+ let DecoderMethod = "DecodeSystemPStateInstruction";
+}
+
+// SYS and SYSL generic system instructions.
+def SysCRAsmOperand : AsmOperandClass {
+ let Name = "SysCR";
+ let ParserMethod = "tryParseSysCROperand";
+}
+
+def sys_cr_op : Operand<i32> {
+ let PrintMethod = "printSysCROperand";
+ let ParserMatchClass = SysCRAsmOperand;
+}
+
+class SystemXtI<bit L, string asm>
+ : RtSystemI<L, (outs),
+ (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
+ asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+class SystemLXtI<bit L, string asm>
+ : RtSystemI<L, (outs),
+ (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+ asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+
+// Branch (register) instructions:
+//
+// case opc of
+// 0001 blr
+// 0000 br
+// 0101 dret
+// 0100 eret
+// 0010 ret
+// otherwise UNDEFINED
+class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
+ string operands, list<dag> pattern>
+ : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
+ let Inst{31-25} = 0b1101011;
+ let Inst{24-21} = opc;
+ let Inst{20-16} = 0b11111;
+ let Inst{15-10} = 0b000000;
+ let Inst{4-0} = 0b00000;
+}
+
+class BranchReg<bits<4> opc, string asm, list<dag> pattern>
+ : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
bits<5> Rn;
-
let Inst{9-5} = Rn;
}
-class A64InstRtn<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRt<outs, ins, asmstr, patterns, itin> {
- // Inherit rdt
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
+class SpecialReturn<bits<4> opc, string asm>
+ : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
+ let Inst{9-5} = 0b11111;
+}
+
+//---
+// Conditional branch instruction.
+//---
+
+// Condition code.
+// 4-bit immediate. Pretty-printed as <cc>
+def ccode : Operand<i32> {
+ let PrintMethod = "printCondCode";
+ let ParserMatchClass = CondCode;
+}
+def inv_ccode : Operand<i32> {
+ let PrintMethod = "printInverseCondCode";
+ let ParserMatchClass = CondCode;
+}
+
+// Conditional branch target. 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def PCRelLabel19Operand : AsmOperandClass {
+ let Name = "PCRelLabel19";
+ let DiagnosticType = "InvalidLabel";
+}
+def am_brcond : Operand<OtherVT> {
+ let EncoderMethod = "getCondBranchTargetOpValue";
+ let DecoderMethod = "DecodePCRelLabel19";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = PCRelLabel19Operand;
+}
+
+class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
+ "b", ".$cond\t$target", "",
+ [(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let Uses = [NZCV];
+
+ bits<4> cond;
+ bits<19> target;
+ let Inst{31-24} = 0b01010100;
+ let Inst{23-5} = target;
+ let Inst{4} = 0;
+ let Inst{3-0} = cond;
+}
+
+//---
+// Compare-and-branch instructions.
+//---
+class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
+ : I<(outs), (ins regtype:$Rt, am_brcond:$target),
+ asm, "\t$Rt, $target", "",
+ [(node regtype:$Rt, bb:$target)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+
+ bits<5> Rt;
+ bits<19> target;
+ let Inst{30-25} = 0b011010;
+ let Inst{24} = op;
+ let Inst{23-5} = target;
+ let Inst{4-0} = Rt;
+}
+
+multiclass CmpBranch<bit op, string asm, SDNode node> {
+ def W : BaseCmpBranch<GPR32, op, asm, node> {
+ let Inst{31} = 0;
+ }
+ def X : BaseCmpBranch<GPR64, op, asm, node> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Test-bit-and-branch instructions.
+//---
+// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
+// the target offset are implied zero and so are not part of the immediate.
+def BranchTarget14Operand : AsmOperandClass {
+ let Name = "BranchTarget14";
+}
+def am_tbrcond : Operand<OtherVT> {
+ let EncoderMethod = "getTestBranchTargetOpValue";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = BranchTarget14Operand;
+}
+
+// AsmOperand classes to emit (or not) special diagnostics
+def TBZImm0_31Operand : AsmOperandClass {
+ let Name = "TBZImm0_31";
+ let PredicateMethod = "isImm0_31";
+ let RenderMethod = "addImm0_31Operands";
+}
+def TBZImm32_63Operand : AsmOperandClass {
+ let Name = "Imm32_63";
+ let DiagnosticType = "InvalidImm0_63";
+}
+
+class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
+ return (((uint32_t)Imm) < 32);
+}]> {
+ let ParserMatchClass = matcher;
+}
+
+def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
+def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
+
+def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
+ return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64);
+}]> {
+ let ParserMatchClass = TBZImm32_63Operand;
+}
+
+class BaseTestBranch<RegisterClass regtype, Operand immtype,
+ bit op, string asm, SDNode node>
+ : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
+ asm, "\t$Rt, $bit_off, $target", "",
+ [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+
+ bits<5> Rt;
+ bits<6> bit_off;
+ bits<14> target;
+
+ let Inst{30-25} = 0b011011;
+ let Inst{24} = op;
+ let Inst{23-19} = bit_off{4-0};
+ let Inst{18-5} = target;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeTestAndBranch";
+}
+
+multiclass TestBranch<bit op, string asm, SDNode node> {
+ def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> {
+ let Inst{31} = 0;
+ }
+
+ def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> {
+ let Inst{31} = 1;
+ }
+
+ // Alias X-reg with 0-31 imm to W-Reg.
+ def : InstAlias<asm # "\t$Rd, $imm, $target",
+ (!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
+ tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
+ def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target),
+ (!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
+ tbz_imm0_31_diag:$imm, bb:$target)>;
+}
+
+//---
+// Unconditional branch (immediate) instructions.
+//---
+def BranchTarget26Operand : AsmOperandClass {
+ let Name = "BranchTarget26";
+ let DiagnosticType = "InvalidLabel";
+}
+def am_b_target : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = BranchTarget26Operand;
+}
+def am_bl_target : Operand<i64> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = BranchTarget26Operand;
+}
+
+class BImm<bit op, dag iops, string asm, list<dag> pattern>
+ : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
+ bits<26> addr;
+ let Inst{31} = op;
+ let Inst{30-26} = 0b00101;
+ let Inst{25-0} = addr;
+
+ let DecoderMethod = "DecodeUnconditionalBranch";
+}
+
+class BranchImm<bit op, string asm, list<dag> pattern>
+ : BImm<op, (ins am_b_target:$addr), asm, pattern>;
+class CallImm<bit op, string asm, list<dag> pattern>
+ : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
+
+//---
+// Basic one-operand data processing instructions.
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
+ SDPatternOperator node>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set regtype:$Rd, (node regtype:$Rn))]>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
bits<5> Rn;
- let Inst{9-5} = Rn;
+ let Inst{30-13} = 0b101101011000000000;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Instructions taking Rt,Rt2,Rn
-class A64InstRtt2n<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<5> Rt2;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass OneOperandData<bits<3> opc, string asm,
+ SDPatternOperator node = null_frag> {
+ def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
+ let Inst{31} = 0;
+ }
- let Inst{14-10} = Rt2;
+ def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
+ let Inst{31} = 1;
+ }
}
-class A64InstRdnm<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
+ : BaseOneOperandData<opc, GPR32, asm, node> {
+ let Inst{31} = 0;
+}
+
+class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
+ : BaseOneOperandData<opc, GPR64, asm, node> {
+ let Inst{31} = 1;
+}
+
+//---
+// Basic two-operand data processing instructions.
+//---
+class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+ bits<5> Rd;
+ bits<5> Rn;
bits<5> Rm;
-
+ let Inst{30} = isSub;
+ let Inst{28-21} = 0b11010000;
let Inst{20-16} = Rm;
+ let Inst{15-10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-class A64InstRtnm<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+ SDNode OpNode>
+ : BaseBaseAddSubCarry<isSub, regtype, asm,
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
+
+class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
+ SDNode OpNode>
+ : BaseBaseAddSubCarry<isSub, regtype, asm,
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
+ (implicit NZCV)]> {
+ let Defs = [NZCV];
+}
+
+multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
+ SDNode OpNode, SDNode OpNode_setflags> {
+ def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
+ let Inst{31} = 0;
+ let Inst{29} = 0;
+ }
+ def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
+ let Inst{31} = 1;
+ let Inst{29} = 0;
+ }
+
+ // Sets flags.
+ def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
+ OpNode_setflags> {
+ let Inst{31} = 0;
+ let Inst{29} = 1;
+ }
+ def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
+ OpNode_setflags> {
+ let Inst{31} = 1;
+ let Inst{29} = 1;
+ }
+}
+
+class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
+ bits<5> Rd;
+ bits<5> Rn;
bits<5> Rm;
-
+ let Inst{30-21} = 0b0011010110;
let Inst{20-16} = Rm;
+ let Inst{15-14} = 0b00;
+ let Inst{13-10} = opc;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-//===----------------------------------------------------------------------===//
-//
-// Actual A64 Instruction Formats
-//
-
-// Format for Add-subtract (extended register) instructions.
-class A64I_addsubext<bit sf, bit op, bit S, bits<2> opt, bits<3> option,
- dag outs, dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<3> Imm3;
-
- let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = S;
- let Inst{28-24} = 0b01011;
- let Inst{23-22} = opt;
- let Inst{21} = 0b1;
- // Rm inherited in 20-16
- let Inst{15-13} = option;
- let Inst{12-10} = Imm3;
- // Rn inherited in 9-5
- // Rd inherited in 4-0
+class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode>
+ : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
+ let Inst{10} = isSigned;
}
-// Format for Add-subtract (immediate) instructions.
-class A64I_addsubimm<bit sf, bit op, bit S, bits<2> shift,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<12> Imm12;
+multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
+ def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
+ Sched<[WriteID32, ReadID, ReadID]> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
+ Sched<[WriteID64, ReadID, ReadID]> {
+ let Inst{31} = 1;
+ }
+}
+
+class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode = null_frag>
+ : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
+ Sched<[WriteIS, ReadI]> {
+ let Inst{11-10} = shift_type;
+}
+
+multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
+ def Wr : BaseShift<shift_type, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+
+ def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
+ (EXTRACT_SUBREG i64:$Rm, sub_32))>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+}
+
+class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
+
+class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
+ RegisterClass addtype, string asm,
+ list<dag> pattern>
+ : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
+ asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<5> Ra;
+ let Inst{30-24} = 0b0011011;
+ let Inst{23-21} = opc;
+ let Inst{20-16} = Rm;
+ let Inst{15} = isSub;
+ let Inst{14-10} = Ra;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+ def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
+ [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+ Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
+ let Inst{31} = 0;
+ }
+
+ def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
+ [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+ Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> {
+ let Inst{31} = 1;
+ }
+}
+
+class WideMulAccum<bit isSub, bits<3> opc, string asm,
+ SDNode AccNode, SDNode ExtNode>
+ : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
+ [(set GPR64:$Rd, (AccNode GPR64:$Ra,
+ (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
+ Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
+ let Inst{31} = 1;
+}
+
+class MulHi<bits<3> opc, string asm, SDNode OpNode>
+ : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
+ Sched<[WriteIM64, ReadIM, ReadIM]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-24} = 0b10011011;
+ let Inst{23-21} = opc;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+ // (i.e. all bits 1) but is ignored by the processor.
+ let PostEncoderMethod = "fixMulHigh";
+}
+
+class MulAccumWAlias<string asm, Instruction inst>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
+class MulAccumXAlias<string asm, Instruction inst>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
+class WideMulAccumAlias<string asm, Instruction inst>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
+
+class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
+ SDPatternOperator OpNode, string asm>
+ : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
+ Sched<[WriteISReg, ReadI, ReadISReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = S;
+ let Inst{30-21} = 0b0011010110;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b010;
+ let Inst{12} = C;
+ let Inst{11-10} = sz;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+ let Predicates = [HasCRC];
+}
+
+//---
+// Address generation.
+//---
+
+class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
+ : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
+ pattern>,
+ Sched<[WriteI]> {
+ bits<5> Xd;
+ bits<21> label;
+ let Inst{31} = page;
+ let Inst{30-29} = label{1-0};
+ let Inst{28-24} = 0b10000;
+ let Inst{23-5} = label{20-2};
+ let Inst{4-0} = Xd;
+
+ let DecoderMethod = "DecodeAdrInstruction";
+}
+
+//---
+// Move immediate.
+//---
+
+def movimm32_imm : Operand<i32> {
+ let ParserMatchClass = Imm0_65535Operand;
+ let EncoderMethod = "getMoveWideImmOpValue";
+ let PrintMethod = "printHexImm";
+}
+def movimm32_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = MovImm32ShifterOperand;
+}
+def movimm64_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = MovImm64ShifterOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+ string asm>
+ : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
+ asm, "\t$Rd, $imm$shift", "", []>,
+ Sched<[WriteImm]> {
+ bits<5> Rd;
+ bits<16> imm;
+ bits<6> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100101;
+ let Inst{22-21} = shift{5-4};
+ let Inst{20-5} = imm;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass MoveImmediate<bits<2> opc, string asm> {
+ def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
+ let Inst{31} = 0;
+ }
+
+ def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+ string asm>
+ : I<(outs regtype:$Rd),
+ (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
+ asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<16> imm;
+ bits<6> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100101;
+ let Inst{22-21} = shift{5-4};
+ let Inst{20-5} = imm;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass InsertImmediate<bits<2> opc, string asm> {
+ def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
+ let Inst{31} = 0;
+ }
+
+ def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Add/Subtract
+//---
+
+class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass srcRegtype, addsub_shifted_imm immtype,
+ string asm, SDPatternOperator OpNode>
+ : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "",
+ [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<14> imm;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
let Inst{28-24} = 0b10001;
- let Inst{23-22} = shift;
- let Inst{21-10} = Imm12;
+ let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
+ let Inst{21-10} = imm{11-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+ let DecoderMethod = "DecodeBaseAddSubImm";
}
-// Format for Add-subtract (shifted register) instructions.
-class A64I_addsubshift<bit sf, bit op, bit S, bits<2> shift,
- dag outs, dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<6> Imm6;
+class BaseAddSubRegPseudo<RegisterClass regtype,
+ SDPatternOperator OpNode>
+ : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteI, ReadI, ReadI]>;
- let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = S;
- let Inst{28-24} = 0b01011;
- let Inst{23-22} = shift;
- let Inst{21} = 0b0;
- // Rm inherited in 20-16
- let Inst{15-10} = Imm6;
- // Rn inherited in 9-5
- // Rd inherited in 4-0
+class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
+ arith_shifted_reg shifted_regtype, string asm,
+ SDPatternOperator OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
+ Sched<[WriteISReg, ReadI, ReadISReg]> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<8> shift;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-22} = shift{7-6};
+ let Inst{21} = 0;
+ let Inst{20-16} = src2;
+ let Inst{15-10} = shift{5-0};
+ let Inst{9-5} = src1;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeThreeAddrSRegInstruction";
}
-// Format for Add-subtract (with carry) instructions.
-class A64I_addsubcarry<bit sf, bit op, bit S, bits<6> opcode2,
- dag outs, dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = S;
- let Inst{28-21} = 0b11010000;
- // Rm inherited in 20-16
- let Inst{15-10} = opcode2;
- // Rn inherited in 9-5
- // Rd inherited in 4-0
+class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, Operand src2Regtype,
+ string asm, SDPatternOperator OpNode>
+ : I<(outs dstRegtype:$R1),
+ (ins src1Regtype:$R2, src2Regtype:$R3),
+ asm, "\t$R1, $R2, $R3", "",
+ [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
+ Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> ext;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = ext{5-3};
+ let Inst{12-10} = ext{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeAddSubERegInstruction";
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ Operand ext_op, string asm>
+ : I<(outs dstRegtype:$Rd),
+ (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
+ asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
+ Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> ext;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rm;
+ let Inst{15} = ext{5};
+ let Inst{12-10} = ext{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
-// Format for Bitfield instructions
-class A64I_bitfield<bit sf, bits<2> opc, bit n,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<6> ImmR;
- bits<6> ImmS;
+ let DecoderMethod = "DecodeAddSubERegInstruction";
+}
- let Inst{31} = sf;
+// Aliases for register+register add/subtract.
+class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ int shiftExt>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
+ shiftExt)>;
+
+multiclass AddSub<bit isSub, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ let hasSideEffects = 0 in {
+ // Add/Subtract immediate
+ def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
+ mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xri : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
+ mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract register - Only used for CodeGen
+ def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+ // Add/Subtract shifted register
+ def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
+ OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
+ OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ // Add/Subtract extended register
+ let AddedComplexity = 1, hasSideEffects = 0 in {
+ def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
+ arith_extended_reg32<i32>, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
+ arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
+ arith_extendlsl64, mnemonic> {
+ // UXTX and SXTX only.
+ let Inst{14-13} = 0b11;
+ let Inst{31} = 1;
+ }
+
+ // Register/register aliases with no shift when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, GPR32, GPR32, 0>;
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, GPR64, GPR64, 0>;
+
+ // Register/register aliases with no shift when either the destination or
+ // first source register is SP.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp> {
+ let isCompare = 1, Defs = [NZCV] in {
+ // Add/Subtract immediate
+ def Wri : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
+ mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xri : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
+ mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract register
+ def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+ // Add/Subtract shifted register
+ def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
+ OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
+ OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract extended register
+ let AddedComplexity = 1 in {
+ def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
+ arith_extended_reg32<i32>, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
+ arith_extended_reg32<i64>, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
+ arith_extendlsl64, mnemonic> {
+ // UXTX and SXTX only.
+ let Inst{14-13} = 0b11;
+ let Inst{31} = 1;
+ }
+ } // Defs = [NZCV]
+
+ // Compare aliases
+ def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+ WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
+ def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+ XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
+ def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+ WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+ def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+ XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+ def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+ XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
+ def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+ WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+ def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+ XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+
+ // Compare shorthands
+ def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+ WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+ def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+ XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+
+ // Register/register aliases with no shift when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, GPR32, GPR32, 0>;
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, GPR64, GPR64, 0>;
+
+ // Register/register aliases with no shift when the first source register
+ // is SP.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+//---
+// Extract
+//---
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisPtrTy<3>]>;
+def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
+
+class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
+ list<dag> patterns>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
+ asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
+ Sched<[WriteExtr, ReadExtrHi]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> imm;
+
+ let Inst{30-23} = 0b00100111;
+ let Inst{21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = imm;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass ExtractImm<string asm> {
+ def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
+ [(set GPR32:$Rd,
+ (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ // imm<5> must be zero.
+ let imm{5} = 0;
+ }
+ def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
+ [(set GPR64:$Rd,
+ (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+//---
+// Bitfield
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImm<bits<2> opc,
+ RegisterClass regtype, Operand imm_type, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
+ asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
+ Sched<[WriteIS, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> immr;
+ bits<6> imms;
+
let Inst{30-29} = opc;
let Inst{28-23} = 0b100110;
- let Inst{22} = n;
- let Inst{21-16} = ImmR;
- let Inst{15-10} = ImmS;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{21-16} = immr;
+ let Inst{15-10} = imms;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format for compare and branch (immediate) instructions.
-class A64I_cmpbr<bit sf, bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRt<outs, ins, asmstr, patterns, itin> {
- bits<19> Label;
-
- let Inst{31} = sf;
- let Inst{30-25} = 0b011010;
- let Inst{24} = op;
- let Inst{23-5} = Label;
- // Inherit Rt in 4-0
+multiclass BitfieldImm<bits<2> opc, string asm> {
+ def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ // imms<5> and immr<5> must be zero, else ReservedValue().
+ let Inst{21} = 0;
+ let Inst{15} = 0;
+ }
+ def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
}
-// Format for conditional branch (immediate) instructions.
-class A64I_condbr<bit o1, bit o0,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
- bits<19> Label;
- bits<4> Cond;
-
- let Inst{31-25} = 0b0101010;
- let Inst{24} = o1;
- let Inst{23-5} = Label;
- let Inst{4} = o0;
- let Inst{3-0} = Cond;
-}
-
-// Format for conditional compare (immediate) instructions.
-class A64I_condcmpimm<bit sf, bit op, bit o2, bit o3, bit s,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImmWith2RegArgs<bits<2> opc,
+ RegisterClass regtype, Operand imm_type, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
+ imm_type:$imms),
+ asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
+ Sched<[WriteIS, ReadI]> {
+ bits<5> Rd;
bits<5> Rn;
- bits<5> UImm5;
- bits<4> NZCVImm;
- bits<4> Cond;
+ bits<6> immr;
+ bits<6> imms;
- let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = s;
- let Inst{28-21} = 0b11010010;
- let Inst{20-16} = UImm5;
- let Inst{15-12} = Cond;
- let Inst{11} = 0b1;
- let Inst{10} = o2;
- let Inst{9-5} = Rn;
- let Inst{4} = o3;
- let Inst{3-0} = NZCVImm;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100110;
+ let Inst{21-16} = immr;
+ let Inst{15-10} = imms;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format for conditional compare (register) instructions.
-class A64I_condcmpreg<bit sf, bit op, bit o2, bit o3, bit s,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
+multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
+ def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ // imms<5> and immr<5> must be zero, else ReservedValue().
+ let Inst{21} = 0;
+ let Inst{15} = 0;
+ }
+ def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+//---
+// Logical
+//---
+
+// Logical (immediate)
+class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
+ RegisterClass sregtype, Operand imm_type, string asm,
+ list<dag> pattern>
+ : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
+ asm, "\t$Rd, $Rn, $imm", "", pattern>,
+ Sched<[WriteI, ReadI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<13> imm;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100100;
+ let Inst{22} = imm{12};
+ let Inst{21-16} = imm{11-6};
+ let Inst{15-10} = imm{5-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeLogicalImmInstruction";
+}
+
+// Logical (shifted register)
+class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
+ logical_shifted_reg shifted_regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteISReg, ReadI, ReadISReg]> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<8> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-24} = 0b01010;
+ let Inst{23-22} = shift{7-6};
+ let Inst{21} = N;
+ let Inst{20-16} = src2;
+ let Inst{15-10} = shift{5-0};
+ let Inst{9-5} = src1;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Aliases for register+register logical instructions.
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+
+let AddedComplexity = 6 in
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
+ def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
+ [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
+ logical_imm32:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+ }
+ def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
+ [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
+ logical_imm64:$imm))]> {
+ let Inst{31} = 1;
+ }
+}
+
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
+ let isCompare = 1, Defs = [NZCV] in {
+ def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+ }
+ def Xri : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
+ let Inst{31} = 1;
+ }
+ } // end Defs = [NZCV]
+}
+
+class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
+ : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteI, ReadI, ReadI]>;
+
+// Split from LogicalImm as not all instructions have both.
+multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
+ SDPatternOperator OpNode> {
+ def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+ def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn,
+ logical_shifted_reg32:$Rm))]> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn,
+ logical_shifted_reg64:$Rm))]> {
+ let Inst{31} = 1;
+ }
+
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Wrs"), GPR32>;
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+// Split from LogicalReg to allow setting NZCV Defs
+multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+ def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
+ let Inst{31} = 1;
+ }
+ } // Defs = [NZCV]
+
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Wrs"), GPR32>;
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+//---
+// Conditionally set flags
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
+ : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
+ asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+ Sched<[WriteI, ReadI]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
+ bits<5> Rn;
+ bits<5> imm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b111010010;
+ let Inst{20-16} = imm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = nzcv;
+}
+
+multiclass CondSetFlagsImm<bit op, string asm> {
+ def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+ asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
bits<5> Rn;
bits<5> Rm;
- bits<4> NZCVImm;
- bits<4> Cond;
+ bits<4> nzcv;
+ bits<4> cond;
-
- let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = s;
- let Inst{28-21} = 0b11010010;
+ let Inst{30} = op;
+ let Inst{29-21} = 0b111010010;
let Inst{20-16} = Rm;
- let Inst{15-12} = Cond;
- let Inst{11} = 0b0;
- let Inst{10} = o2;
- let Inst{9-5} = Rn;
- let Inst{4} = o3;
- let Inst{3-0} = NZCVImm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = nzcv;
}
-// Format for conditional select instructions.
-class A64I_condsel<bit sf, bit op, bit s, bits<2> op2,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<4> Cond;
+multiclass CondSetFlagsReg<bit op, string asm> {
+ def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+ let Inst{31} = 1;
+ }
+}
- let Inst{31} = sf;
- let Inst{30} = op;
- let Inst{29} = s;
- let Inst{28-21} = 0b11010100;
- // Inherit Rm in 20-16
- let Inst{15-12} = Cond;
+//---
+// Conditional select
+//---
+
+class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b011010100;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
let Inst{11-10} = op2;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format for data processing (1 source) instructions
-class A64I_dp_1src<bit sf, bit S, bits<5> opcode2, bits<6> opcode,
- string asmstr, dag outs, dag ins,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = sf;
- let Inst{30} = 0b1;
- let Inst{29} = S;
- let Inst{28-21} = 0b11010110;
- let Inst{20-16} = opcode2;
- let Inst{15-10} = opcode;
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+ def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+ let Inst{31} = 1;
+ }
}
-// Format for data processing (2 source) instructions
-class A64I_dp_2src<bit sf, bits<6> opcode, bit S,
- string asmstr, dag outs, dag ins,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = sf;
- let Inst{30} = 0b0;
- let Inst{29} = S;
- let Inst{28-21} = 0b11010110;
- let Inst{15-10} = opcode;
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+ PatFrag frag>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (AArch64csel regtype:$Rn, (frag regtype:$Rm),
+ (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b011010100;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = op2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format for data-processing (3 source) instructions
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32);
+}]>;
-class A64I_dp3<bit sf, bits<6> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = sf;
- let Inst{30-29} = opcode{5-4};
- let Inst{28-24} = 0b11011;
- let Inst{23-21} = opcode{3-1};
- // Inherits Rm in 20-16
- let Inst{15} = opcode{0};
- // {14-10} mostly Ra, but unspecified for SMULH/UMULH
- // Inherits Rn in 9-5
- // Inherits Rd in 4-0
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+ def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+ let Inst{31} = 1;
+ }
+
+ def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
+ (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
+ (inv_cond_XFORM imm:$cond))>;
+
+ def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
+ (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
+ (inv_cond_XFORM imm:$cond))>;
}
-// Format for exception generation instructions
-class A64I_exception<bits<3> opc, bits<3> op2, bits<2> ll,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
- bits<16> UImm16;
+//---
+// Special Mask Value
+//---
+def maski8_or_more : Operand<i32>,
+ ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+def maski16_or_more : Operand<i32>,
+ ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
+def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
+def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
+def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
+def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
+def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
+
+class UImm12OffsetOperand<int Scale> : AsmOperandClass {
+ let Name = "UImm12Offset" # Scale;
+ let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
+ let PredicateMethod = "isUImm12Offset<" # Scale # ">";
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale;
+}
+
+def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
+def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
+def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
+def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
+def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
+
+class uimm12_scaled<int Scale> : Operand<i64> {
+ let ParserMatchClass
+ = !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
+ let EncoderMethod
+ = "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
+ let PrintMethod = "printUImm12Offset<" # Scale # ">";
+}
+
+def uimm12s1 : uimm12_scaled<1>;
+def uimm12s2 : uimm12_scaled<2>;
+def uimm12s4 : uimm12_scaled<4>;
+def uimm12s8 : uimm12_scaled<8>;
+def uimm12s16 : uimm12_scaled<16>;
+
+class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+ bits<5> Rt;
+
+ bits<5> Rn;
+ bits<12> offset;
+
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b01;
+ let Inst{23-22} = opc;
+ let Inst{21-10} = offset;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeUnsignedLdStInstruction";
+}
+
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand indextype, string asm, list<dag> pattern> {
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+ def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, indextype:$offset),
+ asm, pattern>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand indextype, string asm, list<dag> pattern> {
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+ def ui : BaseLoadStoreUI<sz, V, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
+ asm, pattern>,
+ Sched<[WriteST]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+def PrefetchOperand : AsmOperandClass {
+ let Name = "Prefetch";
+ let ParserMethod = "tryParsePrefetch";
+}
+def prfop : Operand<i32> {
+ let PrintMethod = "printPrefetchOp";
+ let ParserMatchClass = PrefetchOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+ : BaseLoadStoreUI<sz, V, opc,
+ (outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
+ asm, pat>,
+ Sched<[WriteLD]>;
+
+//---
+// Load literal
+//---
+
+// Load literal address: 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def am_ldrlit : Operand<OtherVT> {
+ let EncoderMethod = "getLoadLiteralOpValue";
+ let DecoderMethod = "DecodePCRelLabel19";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = PCRelLabel19Operand;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rt), (ins am_ldrlit:$label),
+ asm, "\t$Rt, $label", "", []>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<19> label;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b011;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-5} = label;
+ let Inst{4-0} = Rt;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
+ : I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
+ asm, "\t$Rt, $label", "", pat>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<19> label;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b011;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-5} = label;
+ let Inst{4-0} = Rt;
+}
+
+//---
+// Load/store register offset
+//---
+
+def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
+def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
+def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
+def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
+def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+
+def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
+def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
+def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
+def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
+def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
+
+class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
+ let Name = "Mem" # Reg # "Extend" # Width;
+ let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
+ let RenderMethod = "addMemExtendOperands";
+ let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
+}
+
+def MemWExtend8Operand : MemExtendOperand<"W", 8> {
+ // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+ // the trivial shift.
+ let RenderMethod = "addMemExtend8Operands";
+}
+def MemWExtend16Operand : MemExtendOperand<"W", 16>;
+def MemWExtend32Operand : MemExtendOperand<"W", 32>;
+def MemWExtend64Operand : MemExtendOperand<"W", 64>;
+def MemWExtend128Operand : MemExtendOperand<"W", 128>;
+
+def MemXExtend8Operand : MemExtendOperand<"X", 8> {
+ // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+ // the trivial shift.
+ let RenderMethod = "addMemExtend8Operands";
+}
+def MemXExtend16Operand : MemExtendOperand<"X", 16>;
+def MemXExtend32Operand : MemExtendOperand<"X", 32>;
+def MemXExtend64Operand : MemExtendOperand<"X", 64>;
+def MemXExtend128Operand : MemExtendOperand<"X", 128>;
+
+class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
+ : Operand<i32> {
+ let ParserMatchClass = ParserClass;
+ let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
+ let DecoderMethod = "DecodeMemExtend";
+ let EncoderMethod = "getMemExtendOpValue";
+ let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
+}
+
+def ro_Wextend8 : ro_extend<MemWExtend8Operand, "w", 8>;
+def ro_Wextend16 : ro_extend<MemWExtend16Operand, "w", 16>;
+def ro_Wextend32 : ro_extend<MemWExtend32Operand, "w", 32>;
+def ro_Wextend64 : ro_extend<MemWExtend64Operand, "w", 64>;
+def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
+
+def ro_Xextend8 : ro_extend<MemXExtend8Operand, "x", 8>;
+def ro_Xextend16 : ro_extend<MemXExtend16Operand, "x", 16>;
+def ro_Xextend32 : ro_extend<MemXExtend32Operand, "x", 32>;
+def ro_Xextend64 : ro_extend<MemXExtend64Operand, "x", 64>;
+def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
+
+class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
+ Operand wextend, Operand xextend> {
+ // CodeGen-level pattern covering the entire addressing mode.
+ ComplexPattern Wpat = windex;
+ ComplexPattern Xpat = xindex;
+
+ // Asm-level Operand covering the valid "uxtw #3" style syntax.
+ Operand Wext = wextend;
+ Operand Xext = xextend;
+}
+
+def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
+def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
+def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
+def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
+def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
+ ro_Xextend128>;
+
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+ : InstAlias<asm # " $Rt, [$Rn, $Rm]",
+ (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10 in
+ def roW : LoadStore8RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend8:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10 in
+ def roX : LoadStore8RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10 in
+ def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend8:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10 in
+ def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10 in
+ def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10 in
+ def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10 in
+ def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10 in
+ def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10 in
+ def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10 in
+ def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10 in
+ def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10 in
+ def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+ def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+ def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+ def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+ def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator loadop> {
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+ def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend128:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+ def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+ [(set (Ty regtype:$Rt),
+ (loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend128:$extend)))]>,
+ Sched<[WriteLDIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, ValueType Ty, SDPatternOperator storeop> {
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+ def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend128:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b0;
+ }
+
+ let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+ def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+ [(storeop (Ty regtype:$Rt),
+ (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend128:$extend))]>,
+ Sched<[WriteSTIdx, ReadAdrBase]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
+ string asm, list<dag> pat>
+ : I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<2> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15} = extend{1}; // sign extend Rm?
+ let Inst{14} = 1;
+ let Inst{12} = extend{0}; // do shift?
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+}
+
+multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
+ def roW : BasePrefetchRO<sz, V, opc, (outs),
+ (ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+ asm, [(AArch64Prefetch imm:$Rt,
+ (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))]> {
+ let Inst{13} = 0b0;
+ }
+
+ def roX : BasePrefetchRO<sz, V, opc, (outs),
+ (ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+ asm, [(AArch64Prefetch imm:$Rt,
+ (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))]> {
+ let Inst{13} = 0b1;
+ }
+
+ def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME # "roX") prfop:$Rt,
+ GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+}
+
+//---
+// Load/store unscaled immediate
+//---
+
+def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
+def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
+def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
+def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
+def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
+
+class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pattern> {
+ let AddedComplexity = 1 in // try this before LoadUI
+ def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pattern> {
+ let AddedComplexity = 1 in // try this before StoreUI
+ def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, pattern>,
+ Sched<[WriteST]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
+ list<dag> pat> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+ def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+ (ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, pat>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store unscaled immediate, unprivileged
+//---
+
+class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ dag oops, dag iops, string asm>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ RegisterClass regtype, string asm> {
+ let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+ def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm>,
+ Sched<[WriteLD]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ RegisterClass regtype, string asm> {
+ let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+ def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm>,
+ Sched<[WriteST]>;
+
+ def : InstAlias<asm # " $Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store pre-indexed
+//---
+
+class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b11;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePreIdx<sz, V, opc,
+ (outs GPR64sp:$wback, regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm,
+ "$Rn = $wback", []>,
+ Sched<[WriteLD, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, SDPatternOperator storeop, ValueType Ty>
+ : BaseLoadStorePreIdx<sz, V, opc,
+ (outs GPR64sp:$wback),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, "$Rn = $wback",
+ [(set GPR64sp:$wback,
+ (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+ Sched<[WriteAdr, WriteST]>;
+} // hasSideEffects = 0
+
+//---
+// Load/store post-indexed
+//---
+
+// (pre-index) load/stores.
+class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
+ bits<5> Rt;
+ bits<5> Rn;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0b0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePostIdx<sz, V, opc,
+ (outs GPR64sp:$wback, regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset),
+ asm, "$Rn = $wback", []>,
+ Sched<[WriteLD, WriteI]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, SDPatternOperator storeop, ValueType Ty>
+ : BaseLoadStorePostIdx<sz, V, opc,
+ (outs GPR64sp:$wback),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, "$Rn = $wback",
+ [(set GPR64sp:$wback,
+ (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+ Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+} // hasSideEffects = 0
+
+
+//---
+// Load/store pair
+//---
+
+// (indexed, offset)
+
+class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b010;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+ def i : BaseLoadStorePairOffset<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, indextype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi]>;
+
+ def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+ def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, indextype:$offset),
+ asm>,
+ Sched<[WriteSTP]>;
+
+ def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+// (pre-indexed)
+class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b011;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairPreIdx<opc, V, 1,
+ (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, indextype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, indextype:$offset),
+ asm>,
+ Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (post-indexed)
+
+class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b001;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand idxtype, string asm>
+ : BaseLoadStorePairPostIdx<opc, V, 1,
+ (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, idxtype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand idxtype, string asm>
+ : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
+ (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, idxtype:$offset),
+ asm>,
+ Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (no-allocate)
+
+class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b000;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+ def i : BaseLoadStorePairNoAlloc<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp:$Rn, indextype:$offset), asm>,
+ Sched<[WriteLD, WriteLDHi]>;
+
+
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm> {
+ let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
+ def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, indextype:$offset),
+ asm>,
+ Sched<[WriteSTP]>;
+
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+ GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store exclusive
+//---
+
+// True exclusive operations write to and/or read from the system's exclusive
+// monitors, which as far as a compiler is concerned can be modelled as a
+// random shared memory address. Hence LoadExclusive mayStore.
+//
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fields since Rt and Rn are always used.
+//
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ dag oops, dag iops, string asm, string operands>
+ : I<oops, iops, asm, operands, "", []> {
+ let Inst{31-30} = sz;
+ let Inst{29-24} = 0b001000;
+ let Inst{23} = o2;
+ let Inst{22} = L;
+ let Inst{21} = o1;
+ let Inst{15} = o0;
+
+ let DecoderMethod = "DecodeExclusiveLdStInstruction";
+}
+
+// Neither Rs nor Rt2 operands.
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ dag oops, dag iops, string asm, string operands>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
+ bits<5> Rt;
+ bits<5> Rn;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+// Simple load acquires don't set the exclusive monitor
+let mayLoad = 1, mayStore = 0 in
+class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+ (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+ Sched<[WriteLD]>;
+
+class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+ (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+ Sched<[WriteLD]>;
+
+class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins GPR64sp0:$Rn), asm,
+ "\t$Rt, $Rt2, [$Rn]">,
+ Sched<[WriteLD, WriteLDHi]> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+}
+
+// Simple store release operations do not check the exclusive monitor.
+let mayLoad = 0, mayStore = 1 in
+class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
+ (ins regtype:$Rt, GPR64sp0:$Rn),
+ asm, "\t$Rt, [$Rn]">,
+ Sched<[WriteST]>;
+
+let mayLoad = 1, mayStore = 1 in
+class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
+ (ins regtype:$Rt, GPR64sp0:$Rn),
+ asm, "\t$Ws, $Rt, [$Rn]">,
+ Sched<[WriteSTX]> {
+ bits<5> Ws;
+ bits<5> Rt;
+ bits<5> Rn;
+ let Inst{20-16} = Ws;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let Constraints = "@earlyclobber $Ws";
+ let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+}
+
+class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+ (outs GPR32:$Ws),
+ (ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
+ asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
+ Sched<[WriteSTX]> {
+ bits<5> Ws;
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<5> Rn;
+ let Inst{20-16} = Ws;
+ let Inst{14-10} = Rt2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let Constraints = "@earlyclobber $Ws";
+}
+
+//---
+// Exception generation
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
+ : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+ Sched<[WriteSys]> {
+ bits<16> imm;
let Inst{31-24} = 0b11010100;
- let Inst{23-21} = opc;
- let Inst{20-5} = UImm16;
- let Inst{4-2} = op2;
- let Inst{1-0} = ll;
-}
-
-// Format for extract (immediate) instructions
-class A64I_extract<bit sf, bits<3> op, bit n,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<6> LSB;
-
- let Inst{31} = sf;
- let Inst{30-29} = op{2-1};
- let Inst{28-23} = 0b100111;
- let Inst{22} = n;
- let Inst{21} = op{0};
- // Inherits Rm in bits 20-16
- let Inst{15-10} = LSB;
- // Inherits Rn in 9-5
- // Inherits Rd in 4-0
+ let Inst{23-21} = op1;
+ let Inst{20-5} = imm;
+ let Inst{4-2} = 0b000;
+ let Inst{1-0} = ll;
}
let Predicates = [HasFPARMv8] in {
-// Format for floating-point compare instructions.
-class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
+//---
+// Floating point to integer conversion
+//---
+
+class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn),
+ asm, "\t$Rd, $Rn", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
bits<5> Rn;
- bits<5> Rm;
-
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
+ let Inst{30-29} = 0b00;
let Inst{28-24} = 0b11110;
let Inst{23-22} = type;
- let Inst{21} = 0b1;
- let Inst{20-16} = Rm;
- let Inst{15-14} = op;
- let Inst{13-10} = 0b1000;
- let Inst{9-5} = Rn;
- let Inst{4-0} = opcode2;
-}
-
-// Format for floating-point conditional compare instructions.
-class A64I_fpccmp<bit m, bit s, bits<2> type, bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<5> Rn;
- bits<5> Rm;
- bits<4> NZCVImm;
- bits<4> Cond;
-
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = type;
- let Inst{21} = 0b1;
- let Inst{20-16} = Rm;
- let Inst{15-12} = Cond;
- let Inst{11-10} = 0b01;
- let Inst{9-5} = Rn;
- let Inst{4} = op;
- let Inst{3-0} = NZCVImm;
-}
-
-// Format for floating-point conditional select instructions.
-class A64I_fpcondsel<bit m, bit s, bits<2> type,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<4> Cond;
-
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = type;
- let Inst{21} = 0b1;
- // Inherit Rm in 20-16
- let Inst{15-12} = Cond;
- let Inst{11-10} = 0b11;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-
-
-// Format for floating-point data-processing (1 source) instructions.
-class A64I_fpdp1<bit m, bit s, bits<2> type, bits<6> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = type;
- let Inst{21} = 0b1;
- let Inst{20-15} = opcode;
- let Inst{14-10} = 0b10000;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-
-// Format for floating-point data-processing (2 sources) instructions.
-class A64I_fpdp2<bit m, bit s, bits<2> type, bits<4> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = type;
- let Inst{21} = 0b1;
- // Inherit Rm in 20-16
- let Inst{15-12} = opcode;
- let Inst{11-10} = 0b10;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-
-// Format for floating-point data-processing (3 sources) instructions.
-class A64I_fpdp3<bit m, bit s, bits<2> type, bit o1, bit o0,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<5> Ra;
-
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11111;
- let Inst{23-22} = type;
- let Inst{21} = o1;
- // Inherit Rm in 20-16
- let Inst{15} = o0;
- let Inst{14-10} = Ra;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-
-// Format for floating-point <-> fixed-point conversion instructions.
-class A64I_fpfixed<bit sf, bit s, bits<2> type, bits<2> mode, bits<3> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<6> Scale;
-
- let Inst{31} = sf;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = type;
- let Inst{21} = 0b0;
- let Inst{20-19} = mode;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
- let Inst{15-10} = Scale;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{15-10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format for floating-point <-> integer conversion instructions.
-class A64I_fpint<bit sf, bit s, bits<2> type, bits<2> rmode, bits<3> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = sf;
- let Inst{30} = 0b0;
- let Inst{29} = s;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ Operand immType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+ asm, "\t$Rd, $Rn, $scale", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-29} = 0b00;
let Inst{28-24} = 0b11110;
let Inst{23-22} = type;
- let Inst{21} = 0b1;
+ let Inst{21} = 0;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = scale;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
+ SDPatternOperator OpN> {
+ // Unscaled single-precision to 32-bit
+ def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
+ [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Unscaled single-precision to 64-bit
+ def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
+ [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Unscaled double-precision to 32-bit
+ def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
+ [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Unscaled double-precision to 64-bit
+ def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
+ [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+}
+
+multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
+ SDPatternOperator OpN> {
+ // Scaled single-precision to 32-bit
+ def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
+ fixedpoint_f32_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
+ fixedpoint_f32_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ }
+
+ // Scaled single-precision to 64-bit
+ def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
+ fixedpoint_f32_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
+ fixedpoint_f32_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Scaled double-precision to 32-bit
+ def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
+ fixedpoint_f64_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
+ fixedpoint_f64_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ }
+
+ // Scaled double-precision to 64-bit
+ def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
+ fixedpoint_f64_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
+ fixedpoint_f64_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+}
+
+//---
+// Integer to floating point conversion
+//---
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseIntegerToFP<bit isUnsigned,
+ RegisterClass srcType, RegisterClass dstType,
+ Operand immType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+ asm, "\t$Rd, $Rn, $scale", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-23} = 0b00111100;
+ let Inst{21-17} = 0b00001;
+ let Inst{16} = isUnsigned;
+ let Inst{15-10} = scale;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseIntegerToFPUnscaled<bit isUnsigned,
+ RegisterClass srcType, RegisterClass dstType,
+ ValueType dvt, string asm, SDNode node>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn),
+ asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-23} = 0b00111100;
+ let Inst{21-17} = 0b10001;
+ let Inst{16} = isUnsigned;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+ // Unscaled
+ def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ // Scaled
+ def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
+ [(set FPR32:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f32_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ let scale{5} = 1;
+ }
+
+ def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
+ [(set FPR64:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f64_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ let scale{5} = 1;
+ }
+
+ def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
+ [(set FPR32:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f32_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
+ [(set FPR64:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f64_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+}
+
+//---
+// Unscaled integer <-> floating point conversion (i.e. FMOV)
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
+ // We use COPY_TO_REGCLASS for these bitconvert operations.
+ // copyPhysReg() expands the resultant COPY instructions after
+ // regalloc is done. This gives greater freedom for the allocator
+ // and related passes (coalescing, copy propagation, et. al.) to
+ // be more effective.
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-23} = 0b00111100;
+ let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-
-
-// Format for floating-point immediate instructions.
-class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRd<outs, ins, asmstr, patterns, itin> {
- bits<8> Imm8;
-
- let Inst{31} = m;
- let Inst{30} = 0b0;
- let Inst{29} = s;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = type;
- let Inst{21} = 0b1;
- let Inst{20-13} = Imm8;
- let Inst{12-10} = 0b100;
- let Inst{9-5} = imm5;
- // Inherit Rd in 4-0
-}
-
-}
-
-// Format for load-register (literal) instructions.
-class A64I_LDRlit<bits<2> opc, bit v,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRt<outs, ins, asmstr, patterns, itin> {
- bits<19> Imm19;
-
- let Inst{31-30} = opc;
- let Inst{29-27} = 0b011;
- let Inst{26} = v;
- let Inst{25-24} = 0b00;
- let Inst{23-5} = Imm19;
- // Inherit Rt in 4-0
-}
-
-// Format for load-store exclusive instructions.
-class A64I_LDSTex_tn<bits<2> size, bit o2, bit L, bit o1, bit o0,
- dag outs, dag ins, string asmstr,
- list <dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- let Inst{31-30} = size;
- let Inst{29-24} = 0b001000;
- let Inst{23} = o2;
- let Inst{22} = L;
- let Inst{21} = o1;
- let Inst{15} = o0;
-}
-
-class A64I_LDSTex_tt2n<bits<2> size, bit o2, bit L, bit o1, bit o0,
- dag outs, dag ins, string asmstr,
- list <dag> patterns, InstrItinClass itin>:
- A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
- bits<5> Rt2;
- let Inst{14-10} = Rt2;
-}
-
-class A64I_LDSTex_stn<bits<2> size, bit o2, bit L, bit o1, bit o0,
- dag outs, dag ins, string asmstr,
- list <dag> patterns, InstrItinClass itin>:
- A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
- bits<5> Rs;
- let Inst{20-16} = Rs;
-}
-
-class A64I_LDSTex_stt2n<bits<2> size, bit o2, bit L, bit o1, bit o0,
- dag outs, dag ins, string asmstr,
- list <dag> patterns, InstrItinClass itin>:
- A64I_LDSTex_stn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
- bits<5> Rt2;
- let Inst{14-10} = Rt2;
-}
-
-// Format for load-store register (immediate post-indexed) instructions
-class A64I_LSpostind<bits<2> size, bit v, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<9> SImm9;
-
- let Inst{31-30} = size;
- let Inst{29-27} = 0b111;
- let Inst{26} = v;
- let Inst{25-24} = 0b00;
- let Inst{23-22} = opc;
- let Inst{21} = 0b0;
- let Inst{20-12} = SImm9;
- let Inst{11-10} = 0b01;
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for load-store register (immediate pre-indexed) instructions
-class A64I_LSpreind<bits<2> size, bit v, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<9> SImm9;
-
-
- let Inst{31-30} = size;
- let Inst{29-27} = 0b111;
- let Inst{26} = v;
- let Inst{25-24} = 0b00;
- let Inst{23-22} = opc;
- let Inst{21} = 0b0;
- let Inst{20-12} = SImm9;
- let Inst{11-10} = 0b11;
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for load-store register (unprivileged) instructions
-class A64I_LSunpriv<bits<2> size, bit v, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<9> SImm9;
-
-
- let Inst{31-30} = size;
- let Inst{29-27} = 0b111;
- let Inst{26} = v;
- let Inst{25-24} = 0b00;
- let Inst{23-22} = opc;
- let Inst{21} = 0b0;
- let Inst{20-12} = SImm9;
- let Inst{11-10} = 0b10;
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for load-store (unscaled immediate) instructions.
-class A64I_LSunalimm<bits<2> size, bit v, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<9> SImm9;
-
- let Inst{31-30} = size;
- let Inst{29-27} = 0b111;
- let Inst{26} = v;
- let Inst{25-24} = 0b00;
- let Inst{23-22} = opc;
- let Inst{21} = 0b0;
- let Inst{20-12} = SImm9;
- let Inst{11-10} = 0b00;
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-
-// Format for load-store (unsigned immediate) instructions.
-class A64I_LSunsigimm<bits<2> size, bit v, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<12> UImm12;
-
- let Inst{31-30} = size;
- let Inst{29-27} = 0b111;
- let Inst{26} = v;
- let Inst{25-24} = 0b01;
- let Inst{23-22} = opc;
- let Inst{21-10} = UImm12;
-}
-
-// Format for load-store register (register offset) instructions.
-class A64I_LSregoff<bits<2> size, bit v, bits<2> opc, bit optionlo,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin> {
- bits<5> Rm;
-
- // Complex operand selection needed for these instructions, so they
- // need an "addr" field for encoding/decoding to be generated.
- bits<3> Ext;
- // OptionHi = Ext{2-1}
- // S = Ext{0}
-
- let Inst{31-30} = size;
- let Inst{29-27} = 0b111;
- let Inst{26} = v;
- let Inst{25-24} = 0b00;
- let Inst{23-22} = opc;
- let Inst{21} = 0b1;
- let Inst{20-16} = Rm;
- let Inst{15-14} = Ext{2-1};
- let Inst{13} = optionlo;
- let Inst{12} = Ext{0};
- let Inst{11-10} = 0b10;
- // Inherits Rn in 9-5
- // Inherits Rt in 4-0
-
- let AddedComplexity = 50;
-}
-
-// Format for Load-store register pair (offset) instructions
-class A64I_LSPoffset<bits<2> opc, bit v, bit l,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
- bits<7> SImm7;
-
- let Inst{31-30} = opc;
- let Inst{29-27} = 0b101;
- let Inst{26} = v;
- let Inst{25-23} = 0b010;
- let Inst{22} = l;
- let Inst{21-15} = SImm7;
- // Inherit Rt2 in 14-10
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for Load-store register pair (post-indexed) instructions
-class A64I_LSPpostind<bits<2> opc, bit v, bit l,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
- bits<7> SImm7;
-
- let Inst{31-30} = opc;
- let Inst{29-27} = 0b101;
- let Inst{26} = v;
- let Inst{25-23} = 0b001;
- let Inst{22} = l;
- let Inst{21-15} = SImm7;
- // Inherit Rt2 in 14-10
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for Load-store register pair (pre-indexed) instructions
-class A64I_LSPpreind<bits<2> opc, bit v, bit l,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
- bits<7> SImm7;
-
- let Inst{31-30} = opc;
- let Inst{29-27} = 0b101;
- let Inst{26} = v;
- let Inst{25-23} = 0b011;
- let Inst{22} = l;
- let Inst{21-15} = SImm7;
- // Inherit Rt2 in 14-10
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for Load-store non-temporal register pair (offset) instructions
-class A64I_LSPnontemp<bits<2> opc, bit v, bit l,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
- bits<7> SImm7;
-
- let Inst{31-30} = opc;
- let Inst{29-27} = 0b101;
- let Inst{26} = v;
- let Inst{25-23} = 0b000;
- let Inst{22} = l;
- let Inst{21-15} = SImm7;
- // Inherit Rt2 in 14-10
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format for Logical (immediate) instructions
-class A64I_logicalimm<bit sf, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bit N;
- bits<6> ImmR;
- bits<6> ImmS;
-
- // N, ImmR and ImmS have no separate existence in any assembly syntax (or for
- // selection), so we'll combine them into a single field here.
- bits<13> Imm;
- // N = Imm{12};
- // ImmR = Imm{11-6};
- // ImmS = Imm{5-0};
-
- let Inst{31} = sf;
- let Inst{30-29} = opc;
- let Inst{28-23} = 0b100100;
- let Inst{22} = Imm{12};
- let Inst{21-16} = Imm{11-6};
- let Inst{15-10} = Imm{5-0};
- // Rn inherited in 9-5
- // Rd inherited in 4-0
-}
-
-// Format for Logical (shifted register) instructions
-class A64I_logicalshift<bit sf, bits<2> opc, bits<2> shift, bit N,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<6> Imm6;
-
- let Inst{31} = sf;
- let Inst{30-29} = opc;
- let Inst{28-24} = 0b01010;
- let Inst{23-22} = shift;
- let Inst{21} = N;
- // Rm inherited
- let Inst{15-10} = Imm6;
- // Rn inherited
- // Rd inherited
-}
-
-// Format for Move wide (immediate)
-class A64I_movw<bit sf, bits<2> opc,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRd<outs, ins, asmstr, patterns, itin> {
- bits<16> UImm16;
- bits<2> Shift; // Called "hw" officially
-
- let Inst{31} = sf;
- let Inst{30-29} = opc;
- let Inst{28-23} = 0b100101;
- let Inst{22-21} = Shift;
- let Inst{20-5} = UImm16;
- // Inherits Rd in 4-0
-}
-
-// Format for PC-relative addressing instructions, ADR and ADRP.
-class A64I_PCADR<bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRd<outs, ins, asmstr, patterns, itin> {
- bits<21> Label;
-
- let Inst{31} = op;
- let Inst{30-29} = Label{1-0};
- let Inst{28-24} = 0b10000;
- let Inst{23-5} = Label{20-2};
-}
-
-// Format for system instructions
-class A64I_system<bit l,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
- bits<2> Op0;
- bits<3> Op1;
- bits<4> CRn;
- bits<4> CRm;
- bits<3> Op2;
- bits<5> Rt;
-
- let Inst{31-22} = 0b1101010100;
- let Inst{21} = l;
- let Inst{20-19} = Op0;
- let Inst{18-16} = Op1;
- let Inst{15-12} = CRn;
- let Inst{11-8} = CRm;
- let Inst{7-5} = Op2;
- let Inst{4-0} = Rt;
-
- // These instructions can do horrible things.
- let hasSideEffects = 1;
-}
-
-// Format for unconditional branch (immediate) instructions
-class A64I_Bimm<bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
- // Doubly special in not even sharing register fields with other
- // instructions, so we create our own Rn here.
- bits<26> Label;
-
- let Inst{31} = op;
- let Inst{30-26} = 0b00101;
- let Inst{25-0} = Label;
-}
-
-// Format for Test & branch (immediate) instructions
-class A64I_TBimm<bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRt<outs, ins, asmstr, patterns, itin> {
- // Doubly special in not even sharing register fields with other
- // instructions, so we create our own Rn here.
- bits<6> Imm;
- bits<14> Label;
-
- let Inst{31} = Imm{5};
- let Inst{30-25} = 0b011011;
- let Inst{24} = op;
- let Inst{23-19} = Imm{4-0};
- let Inst{18-5} = Label;
- // Inherit Rt in 4-0
-}
-
-// Format for Unconditional branch (register) instructions, including
-// RET. Shares no fields with instructions further up the hierarchy
-// so top-level.
-class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64Inst<outs, ins, asmstr, patterns, itin> {
- // Doubly special in not even sharing register fields with other
- // instructions, so we create our own Rn here.
- bits<5> Rn;
-
- let Inst{31-25} = 0b1101011;
- let Inst{24-21} = opc;
- let Inst{20-16} = op2;
- let Inst{15-10} = op3;
let Inst{9-5} = Rn;
- let Inst{4-0} = op4;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterOperand dstType, string asm,
+ string kind>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+ "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-23} = 0b00111101;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeFMOVLaneInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
+ RegisterOperand srcType, RegisterClass dstType, string asm,
+ string kind>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+ "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-23} = 0b00111101;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeFMOVLaneInstruction";
}
-//===----------------------------------------------------------------------===//
-//
-// Neon Instruction Format Definitions.
-//
+
+multiclass UnscaledConversion<string asm> {
+ def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
+ asm, ".d"> {
+ let Inst{31} = 1;
+ let Inst{22} = 0;
+ }
+
+ def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
+ asm, ".d"> {
+ let Inst{31} = 1;
+ let Inst{22} = 0;
+ }
+}
+
+//---
+// Floating point conversion
+//---
+
+class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
+ RegisterClass srcType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00011110;
+ let Inst{23-22} = type;
+ let Inst{21-17} = 0b10001;
+ let Inst{16-15} = opcode;
+ let Inst{14-10} = 0b10000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPConversion<string asm> {
+ // Double-precision to Half-precision
+ def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
+ [(set FPR16:$Rd, (fround FPR64:$Rn))]>;
+
+ // Double-precision to Single-precision
+ def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+ [(set FPR32:$Rd, (fround FPR64:$Rn))]>;
+
+ // Half-precision to Double-precision
+ def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
+ [(set FPR64:$Rd, (fextend FPR16:$Rn))]>;
+
+ // Half-precision to Single-precision
+ def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
+ [(set FPR32:$Rd, (fextend FPR16:$Rn))]>;
+
+ // Single-precision to Double-precision
+ def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+ [(set FPR64:$Rd, (fextend FPR32:$Rn))]>;
+
+ // Single-precision to Half-precision
+ def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
+ [(set FPR16:$Rd, (fround FPR32:$Rn))]>;
+}
+
+//---
+// Single operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
+ ValueType vt, string asm, SDPatternOperator node>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21-19} = 0b100;
+ let Inst{18-15} = opcode;
+ let Inst{14-10} = 0b10000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SingleOperandFPData<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag> {
+ def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+//---
+// Two operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pat>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass TwoOperandFPData<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag> {
+ def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+ [(set (f32 FPR32:$Rd),
+ (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+ [(set (f64 FPR64:$Rd),
+ (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+ def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+ [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+ [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+
+//---
+// Three operand floating point data processing
+//---
+
+class BaseThreeOperandFPData<bit isNegated, bit isSub,
+ RegisterClass regtype, string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
+ asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
+ Sched<[WriteFMul]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<5> Ra;
+ let Inst{31-23} = 0b000111110;
+ let Inst{21} = isNegated;
+ let Inst{20-16} = Rm;
+ let Inst{15} = isSub;
+ let Inst{14-10} = Ra;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
+ SDPatternOperator node> {
+ def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
+ [(set FPR32:$Rd,
+ (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
+ [(set FPR64:$Rd,
+ (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+//---
+// Floating point data comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandFPComparison<bit signalAllNans,
+ RegisterClass regtype, string asm,
+ list<dag> pat>
+ : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rn;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = 0b1000;
+
+ // Rm should be 0b00000 canonically, but we need to accept any value.
+ let PostEncoderMethod = "fixOneOperandFPComparison";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = 0b0000;
+}
+
+multiclass FPComparison<bit signalAllNans, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let Defs = [NZCV] in {
+ def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
+ [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
+ let Inst{22} = 0;
+ }
+
+ def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
+ [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{22} = 0;
+ }
+
+ def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
+ [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
+ let Inst{22} = 1;
+ }
+
+ def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
+ [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{22} = 1;
+ }
+ } // Defs = [NZCV]
+}
+
+//---
+// Floating point conditional comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPCondComparison<bit signalAllNans,
+ RegisterClass regtype, string asm>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+ asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = nzcv;
+}
+
+multiclass FPCondComparison<bit signalAllNans, string asm> {
+ let Defs = [NZCV], Uses = [NZCV] in {
+ def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
+ let Inst{22} = 0;
+ }
+
+ def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
+ let Inst{22} = 1;
+ }
+ } // Defs = [NZCV], Uses = [NZCV]
+}
+
+//---
+// Floating point conditional select
+//---
+
+class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (AArch64csel (vt regtype:$Rn), regtype:$Rm,
+ (i32 imm:$cond), NZCV))]>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b11;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPCondSelect<string asm> {
+ let Uses = [NZCV] in {
+ def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
+ let Inst{22} = 0;
+ }
+
+ def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
+ let Inst{22} = 1;
+ }
+ } // Uses = [NZCV]
+}
+
+//---
+// Floating move immediate
+//---
+
+class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
+ : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
+ [(set regtype:$Rd, fpimmtype:$imm)]>,
+ Sched<[WriteFImm]> {
+ bits<5> Rd;
+ bits<8> imm;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-13} = imm;
+ let Inst{12-5} = 0b10000000;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPMoveImmediate<string asm> {
+ def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
+ let Inst{22} = 0;
+ }
+
+ def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
+ let Inst{22} = 1;
+ }
+}
+} // end of 'let Predicates = [HasFPARMv8]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD
+//----------------------------------------------------------------------------
let Predicates = [HasNEON] in {
-class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit> {
+//----------------------------------------------------------------------------
+// AdvSIMD three register vector instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD bitwise extract
-class NeonI_BitExtract<bit q, bits<2> op2,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ list<dag> pattern>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// All operand sizes distinguished in the encoding.
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+ def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
+ asm, ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
+// As above, but D sized elements unsupported.
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+ def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd),
+ (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+}
+
+// As above, but only S and D sized floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+ string asm, SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+ string asm, SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ [(set (v2f32 V64:$dst),
+ (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ [(set (v4f32 V128:$dst),
+ (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ [(set (v2f64 V128:$dst),
+ (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+ def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+ def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
+
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+ string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]>;
+
+ def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+ (v4i16 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+ (v2i32 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+ (v1i64 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+ def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+ (v8i16 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+ (v4i32 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+ (v2i64 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind #
+ "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, list<dag> pattern>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind #
+ "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Supports B, H, and S element sizes.
+multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, string amount>
+ : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
+ "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
let Inst{29-24} = 0b101110;
- let Inst{23-22} = op2;
- let Inst{21} = 0b0;
- // Inherit Rm in 20-16
- let Inst{15} = 0b0;
- // imm4 in 14-11
- let Inst{10} = 0b0;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD perm
-class NeonI_Perm<bit q, bits<2> size, bits<3> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29-24} = 0b001110;
let Inst{23-22} = size;
- let Inst{21} = 0b0;
- // Inherit Rm in 20-16
- let Inst{15} = 0b0;
- let Inst{14-12} = opcode;
- let Inst{11-10} = 0b10;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{21-10} = 0b100001001110;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD table lookup
-class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29-24} = 0b001110;
- let Inst{23-22} = op2;
- let Inst{21} = 0b0;
- // Inherit Rm in 20-16
- let Inst{15} = 0b0;
- let Inst{14-13} = len;
- let Inst{12} = op;
+multiclass SIMDVectorLShiftLongBySizeBHS {
+ let neverHasSideEffects = 1 in {
+ def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
+ "shll", ".8h", ".8b", "8">;
+ def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
+ "shll2", ".8h", ".16b", "8">;
+ def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
+ "shll", ".4s", ".4h", "16">;
+ def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
+ "shll2", ".4s", ".8h", "16">;
+ def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
+ "shll", ".2d", ".2s", "32">;
+ def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
+ "shll2", ".2d", ".4s", "32">;
+ }
+}
+
+// Supports all element sizes.
+multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".4h", ".8b",
+ [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".8h", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".2s", ".4h",
+ [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".4s", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ asm, ".1d", ".2s",
+ [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ asm, ".2d", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ asm, ".4h", ".8b",
+ [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
+ (v8i8 V64:$Rn)))]>;
+ def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ asm, ".8h", ".16b",
+ [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
+ (v16i8 V128:$Rn)))]>;
+ def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ asm, ".2s", ".4h",
+ [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
+ (v4i16 V64:$Rn)))]>;
+ def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ asm, ".4s", ".8h",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+ (v8i16 V128:$Rn)))]>;
+ def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ asm, ".1d", ".2s",
+ [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
+ (v2i32 V64:$Rn)))]>;
+ def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ asm, ".2d", ".4s",
+ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
+ (v4i32 V128:$Rn)))]>;
+}
+
+// Supports all element sizes, except 1xD.
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+ def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Supports only B element sizes.
+multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+}
+
+// Supports only B and H element sizes.
+multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
+}
+
+// Supports only S and D element sizes, uses high bit of the size field
+// as an extra opcode bit.
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+// Supports only S element size.
+multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+
+multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand inreg, RegisterOperand outreg,
+ string asm, string outkind, string inkind,
+ list<dag> pattern>
+ : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind #
+ "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand inreg, RegisterOperand outreg,
+ string asm, string outkind, string inkind,
+ list<dag> pattern>
+ : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind #
+ "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
+ asm#"2", ".16b", ".8h", []>;
+ def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
+ asm#"2", ".8h", ".4s", []>;
+ def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+ def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v16i8")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v8i16")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v4i32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype,
+ string asm, string kind, string zero,
+ ValueType dty, ValueType sty, SDNode OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
+ "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
+ [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Comparisons support all element sizes, except 1xD.
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
+ SDNode OpNode> {
+ def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
+ asm, ".8b", "0",
+ v8i8, v8i8, OpNode>;
+ def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
+ asm, ".16b", "0",
+ v16i8, v16i8, OpNode>;
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
+ asm, ".4h", "0",
+ v4i16, v4i16, OpNode>;
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
+ asm, ".8h", "0",
+ v8i16, v8i16, OpNode>;
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
+ asm, ".2s", "0",
+ v2i32, v2i32, OpNode>;
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
+ asm, ".4s", "0",
+ v4i32, v4i32, OpNode>;
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
+ asm, ".2d", "0",
+ v2i64, v2i64, OpNode>;
+}
+
+// FP Comparisons support only S and D element sizes.
+multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
+ string asm, SDNode OpNode> {
+
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", "0.0",
+ v2i32, v2f32, OpNode>;
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", "0.0",
+ v4i32, v4f32, OpNode>;
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", "0.0",
+ v2i64, v2f64, OpNode>;
+
+ def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0",
+ (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0",
+ (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+ def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0",
+ (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+ def : InstAlias<asm # ".2s $Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # ".4s $Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+ def : InstAlias<asm # ".2d $Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand outtype, RegisterOperand intype,
+ string asm, string VdTy, string VnTy,
+ list<dag> pattern>
+ : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
+ !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand outtype, RegisterOperand intype,
+ string asm, string VdTy, string VnTy,
+ list<dag> pattern>
+ : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
+ !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
+ def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
+ asm, ".4s", ".4h", []>;
+ def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
+ asm#"2", ".4s", ".8h", []>;
+ def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
+ asm, ".2d", ".2s", []>;
+ def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".2d", ".4s", []>;
+}
+
+multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
+ def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
+ asm, ".4h", ".4s", []>;
+ def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
+ asm#"2", ".8h", ".4s", []>;
+ def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+ asm, ".2s", ".2d", []>;
+ def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+}
+
+multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+ asm, ".2s", ".2d",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+ def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+
+ def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v4f32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register different-size vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
+ RegisterOperand outtype, RegisterOperand intype1,
+ RegisterOperand intype2, string asm,
+ string outkind, string inkind1, string inkind2,
+ list<dag> pattern>
+ : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+ "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
let Inst{11-10} = 0b00;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD 3 vector registers with same vector type
-class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = u;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
+ RegisterOperand outtype, RegisterOperand intype1,
+ RegisterOperand intype2, string asm,
+ string outkind, string inkind1, string inkind2,
+ list<dag> pattern>
+ : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+ "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29} = U;
let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// FIXME: TableGen doesn't know how to deal with expanded types that also
+// change the element count (in this case, placing the results in
+// the high elements of the result register rather than the low
+// elements). Until that's fixed, we can't code-gen those.
+multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
+ Intrinsic IntOp> {
+ def v8i16_v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V64, V128, V128,
+ asm, ".8b", ".8h", ".8h",
+ [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".16b", ".8h", ".8h",
+ []>;
+ def v4i32_v4i16 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V64, V128, V128,
+ asm, ".4h", ".4s", ".4s",
+ [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+ def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".4s", ".4s",
+ []>;
+ def v2i64_v2i32 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V64, V128, V128,
+ asm, ".2s", ".2d", ".2d",
+ [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+ def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".2d", ".2d",
+ []>;
+
+
+ // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
+ // a version attached to an instruction.
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
+ (v8i16 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v8i16_v16i8")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v4i32_v8i16")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
+ (v2i64 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v2i64_v4i32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
+ Intrinsic IntOp> {
+ def v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b", []>;
+ let Predicates = [HasCrypto] in {
+ def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
+ V128, V64, V64,
+ asm, ".1q", ".1d", ".1d", []>;
+ def v2i64 : BaseSIMDDifferentThreeVector<U, 0b111, opc,
+ V128, V128, V128,
+ asm#"2", ".1q", ".2d", ".2d", []>;
+ }
+
+ def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
+ (v8i8 (extract_high_v16i8 V128:$Rm)))),
+ (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$dst),
+ (add (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$dst),
+ (add (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm))))))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (add (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (add (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm))))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (add (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (add (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm))))))]>;
+}
+
+multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd),
+ (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd),
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd),
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator Accum> {
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+ (v4i16 V64:$Rm)))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
+ (v2i32 V64:$Rm)))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V128, V64,
+ asm, ".8h", ".8h", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".8h", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V128, V64,
+ asm, ".4s", ".4s", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".4s", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V128, V64,
+ asm, ".2d", ".2d", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".2d", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
+ string asm, string kind>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
+ "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
+ [(set (vty regtype:$Rd),
+ (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> imm;
+ let Inst{31} = 0;
+ let Inst{30} = size;
+ let Inst{29-21} = 0b101110000;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-11} = imm;
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDBitwiseExtract<string asm> {
+ def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
+ let imm{3} = 0;
+ }
+ def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
+ string asm, string kind, SDNode OpNode, ValueType valty>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm}", "",
+ [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29-24} = 0b001110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDZipVector<bits<3>opc, string asm,
+ SDNode OpNode> {
+ def v8i8 : BaseSIMDZipVector<0b000, opc, V64,
+ asm, ".8b", OpNode, v8i8>;
+ def v16i8 : BaseSIMDZipVector<0b001, opc, V128,
+ asm, ".16b", OpNode, v16i8>;
+ def v4i16 : BaseSIMDZipVector<0b010, opc, V64,
+ asm, ".4h", OpNode, v4i16>;
+ def v8i16 : BaseSIMDZipVector<0b011, opc, V128,
+ asm, ".8h", OpNode, v8i16>;
+ def v2i32 : BaseSIMDZipVector<0b100, opc, V64,
+ asm, ".2s", OpNode, v2i32>;
+ def v4i32 : BaseSIMDZipVector<0b101, opc, V128,
+ asm, ".4s", OpNode, v4i32>;
+ def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
+ asm, ".2d", OpNode, v2i64>;
+
+ def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
+ (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
- let Inst{21} = 0b1;
- // Inherit Rm in 20-16
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
- let Inst{10} = 0b1;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD 3 vector registers with different vector type
-class NeonI_3VDiff<bit q, bit u, bits<2> size, bits<4> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = u;
+multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+}
+
+multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+ def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+ def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
+
+ def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+ def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+}
+
+multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ }
+
+ def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
+ dag oops, dag iops, string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm,
+ "\t$Rd, $Rn, $Rm", cstr, pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+ (outs FPR32:$Rd),
+ (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
+ def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+ (outs FPR64:$Rd),
+ (ins FPR32:$Rn, FPR32:$Rm), asm, "",
+ [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+ (outs FPR32:$dst),
+ (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
+ asm, "$Rd = $dst", []>;
+ def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+ (outs FPR64:$dst),
+ (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
+ asm, "$Rd = $dst",
+ [(set (i64 FPR64:$dst),
+ (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, RegisterClass regtype2,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
+ "\t$Rd, $Rn", "", pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, RegisterClass regtype2,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
+ "\t$Rd, $Rn", "$Rd = $dst", pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, string asm, string zero>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "\t$Rd, $Rn, #" # zero, "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
+ : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-17} = 0b011111100110000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">;
+
+ def : Pat<(v1i64 (OpNode FPR64:$Rn)),
+ (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm, "0.0">;
+ def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">;
+
+ def : InstAlias<asm # " $Rd, $Rn, #0",
+ (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
+ def : InstAlias<asm # " $Rd, $Rn, #0",
+ (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
+
+ def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
+
+ def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
+}
+
+multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
+ [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+}
+
+multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v1i64 : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
+ [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
+ def v1i8 : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
+}
+
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, RegisterOperand vectype,
+ string asm, string kind>
+ : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+ "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
+ def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
+ asm, ".2d">;
+}
+
+multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
+ def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
+ asm, ".2s">;
+ def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
+ asm, ".2d">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, RegisterOperand vectype,
+ string asm, string kind, list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+ "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21} = 0b1;
- // Inherit Rm in 20-16
- let Inst{15-12} = opcode;
- let Inst{11} = 0b0;
- let Inst{10} = 0b0;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD two registers and an element
-class NeonI_2VElem<bit q, bit u, bits<2> size, bits<4> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = u;
- let Inst{28-24} = 0b01111;
- let Inst{23-22} = size;
- // l in Inst{21}
- // m in Inst{20}
- // Inherit Rm in 19-16
- let Inst{15-12} = opcode;
- // h in Inst{11}
- let Inst{10} = 0b0;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
+ string asm> {
+ def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64,
+ asm, ".8b", []>;
+ def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128,
+ asm, ".16b", []>;
+ def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
+ asm, ".4h", []>;
+ def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
+ asm, ".8h", []>;
+ def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
+ asm, ".4s", []>;
}
-// Format AdvSIMD 1 vector register with modified immediate
-class NeonI_1VModImm<bit q, bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRd<outs,ins, asmstr, patterns, itin> {
- bits<8> Imm;
- bits<4> cmode;
- let Inst{31} = 0b0;
- let Inst{30} = q;
+multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
+ def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
+ asm, ".8b", []>;
+ def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
+ asm, ".16b", []>;
+ def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
+ asm, ".4h", []>;
+ def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
+ asm, ".8h", []>;
+ def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
+ asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
+ Intrinsic intOp> {
+ def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
+ asm, ".4s",
+ [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// FIXME: There has got to be a better way to factor these. ugh.
+
+class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
+ string operands, string constraints, list<dag> pattern>
+ : I<outs, ins, asm, operands, constraints, pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
let Inst{29} = op;
- let Inst{28-19} = 0b0111100000;
- let Inst{15-12} = cmode;
- let Inst{11} = 0b0; // o2
+ let Inst{28-21} = 0b01110000;
+ let Inst{15} = 0;
let Inst{10} = 1;
- // Inherit Rd in 4-0
- let Inst{18-16} = Imm{7-5}; // imm a:b:c
- let Inst{9-5} = Imm{4-0}; // imm d:e:f:g:h
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD 3 scalar registers with same type
-
-class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = 0b1;
- let Inst{29} = u;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = size;
- let Inst{21} = 0b1;
- // Inherit Rm in 20-16
- let Inst{15-11} = opcode;
- let Inst{10} = 0b1;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
+ RegisterOperand vecreg, RegisterClass regtype>
+ : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
+ "{\t$Rd" # size # ", $Rn" #
+ "|" # size # "\t$Rd, $Rn}", "",
+ [(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
+ let Inst{20-16} = imm5;
+ let Inst{14-11} = 0b0001;
}
-
-// Format AdvSIMD 2 vector registers miscellaneous
-class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = u;
- let Inst{28-24} = 0b01110;
- let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
- let Inst{16-12} = opcode;
- let Inst{11-10} = 0b10;
-
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+class SIMDDupFromElement<bit Q, string dstkind, string srckind,
+ ValueType vectype, ValueType insreg,
+ RegisterOperand vecreg, Operand idxtype,
+ ValueType elttype, SDNode OpNode>
+ : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
+ "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
+ "|" # dstkind # "\t$Rd, $Rn$idx}", "",
+ [(set (vectype vecreg:$Rd),
+ (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
+ let Inst{14-11} = 0b0000;
}
-// Format AdvSIMD 2 vector 1 immediate shift
-class NeonI_2VShiftImm<bit q, bit u, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<7> Imm;
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = u;
- let Inst{28-23} = 0b011110;
- let Inst{22-16} = Imm;
- let Inst{15-11} = opcode;
- let Inst{10} = 0b1;
-
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+class SIMDDup64FromElement
+ : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
+ VectorIndexD, i64, AArch64duplane64> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
}
-// Format AdvSIMD duplicate and insert
-class NeonI_copy<bit q, bit op, bits<4> imm4,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<5> Imm5;
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = op;
- let Inst{28-21} = 0b01110000;
- let Inst{20-16} = Imm5;
- let Inst{15} = 0b0;
+class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
+ VectorIndexS, i64, AArch64duplane32> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+}
+
+class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
+ VectorIndexH, i64, AArch64duplane16> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+}
+
+class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
+ VectorIndexB, i64, AArch64duplane8> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+}
+
+class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
+ Operand idxtype, string asm, list<dag> pattern>
+ : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
+ "{\t$Rd, $Rn" # size # "$idx" #
+ "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
let Inst{14-11} = imm4;
- let Inst{10} = 0b1;
-
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-// Format AdvSIMD insert from element to vector
-class NeonI_insert<bit q, bit op,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<5> Imm5;
- bits<4> Imm4;
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = op;
- let Inst{28-21} = 0b01110000;
- let Inst{20-16} = Imm5;
- let Inst{15} = 0b0;
- let Inst{14-11} = Imm4;
- let Inst{10} = 0b1;
-
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
}
-// Format AdvSIMD scalar pairwise
-class NeonI_ScalarPair<bit u, bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = 0b1;
- let Inst{29} = u;
- let Inst{28-24} = 0b11110;
+class SIMDSMov<bit Q, string size, RegisterClass regtype,
+ Operand idxtype>
+ : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
+class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
+ Operand idxtype>
+ : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
+ [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
+
+class SIMDMovAlias<string asm, string size, Instruction inst,
+ RegisterClass regtype, Operand idxtype>
+ : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
+ "|" # size # "\t$dst, $src$idx}",
+ (inst regtype:$dst, V128:$src, idxtype:$idx)>;
+
+multiclass SMov {
+ def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+}
+
+multiclass UMov {
+ def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+ def : SIMDMovAlias<"mov", ".s",
+ !cast<Instruction>(NAME#"vi32"),
+ GPR32, VectorIndexS>;
+ def : SIMDMovAlias<"mov", ".d",
+ !cast<Instruction>(NAME#"vi64"),
+ GPR64, VectorIndexD>;
+}
+
+class SIMDInsFromMain<string size, ValueType vectype,
+ RegisterClass regtype, Operand idxtype>
+ : BaseSIMDInsDup<1, 0, (outs V128:$dst),
+ (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
+ "{\t$Rd" # size # "$idx, $Rn" #
+ "|" # size # "\t$Rd$idx, $Rn}",
+ "$Rd = $dst",
+ [(set V128:$dst,
+ (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+ let Inst{14-11} = 0b0011;
+}
+
+class SIMDInsFromElement<string size, ValueType vectype,
+ ValueType elttype, Operand idxtype>
+ : BaseSIMDInsDup<1, 1, (outs V128:$dst),
+ (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
+ "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
+ "|" # size # "\t$Rd$idx, $Rn$idx2}",
+ "$Rd = $dst",
+ [(set V128:$dst,
+ (vector_insert
+ (vectype V128:$Rd),
+ (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
+ idxtype:$idx))]>;
+
+class SIMDInsMainMovAlias<string size, Instruction inst,
+ RegisterClass regtype, Operand idxtype>
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
+ "|" # size #"\t$dst$idx, $src}",
+ (inst V128:$dst, idxtype:$idx, regtype:$src)>;
+class SIMDInsElementMovAlias<string size, Instruction inst,
+ Operand idxtype>
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+ # "|" # size #" $dst$idx, $src$idx2}",
+ (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
+
+
+multiclass SIMDIns {
+ def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+
+ def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
+ bits<4> idx;
+ bits<4> idx2;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ let Inst{14-11} = idx2;
+ }
+ def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
+ bits<3> idx;
+ bits<3> idx2;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ let Inst{14-12} = idx2;
+ let Inst{11} = 0;
+ }
+ def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
+ bits<2> idx;
+ bits<2> idx2;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ let Inst{14-13} = idx2;
+ let Inst{12-11} = 0;
+ }
+ def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
+ bits<1> idx;
+ bits<1> idx2;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ let Inst{14} = idx2;
+ let Inst{13-11} = 0;
+ }
+
+ // For all forms of the INS instruction, the "mov" mnemonic is the
+ // preferred alias. Why they didn't just call the instruction "mov" in
+ // the first place is a very good question indeed...
+ def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+ GPR32, VectorIndexB>;
+ def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+ GPR32, VectorIndexH>;
+ def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+ GPR32, VectorIndexS>;
+ def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+ GPR64, VectorIndexD>;
+
+ def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+ VectorIndexB>;
+ def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+ VectorIndexH>;
+ def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+ VectorIndexS>;
+ def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+ VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+ RegisterOperand listtype, string asm, string kind>
+ : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+ "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+ Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-21} = 0b001110000;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+ RegisterOperand listtype, string asm, string kind>
+ : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+ "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+ Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-21} = 0b001110000;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+class SIMDTableLookupAlias<string asm, Instruction inst,
+ RegisterOperand vectype, RegisterOperand listtype>
+ : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
+ (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+multiclass SIMDTableLookup<bit op, string asm> {
+ def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+ asm, ".8b">;
+ def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+ asm, ".8b">;
+ def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+ asm, ".8b">;
+ def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+ asm, ".8b">;
+ def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+ asm, ".16b">;
+ def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+ asm, ".16b">;
+ def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+ asm, ".16b">;
+ def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+ asm, ".16b">;
+
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8One"),
+ V64, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Two"),
+ V64, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Three"),
+ V64, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Four"),
+ V64, VecListFour128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8One"),
+ V128, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Two"),
+ V128, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Three"),
+ V128, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Four"),
+ V128, VecListFour128>;
+}
+
+multiclass SIMDTableLookupTied<bit op, string asm> {
+ def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+ asm, ".8b">;
+ def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+ asm, ".8b">;
+ def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+ asm, ".8b">;
+ def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+ asm, ".8b">;
+ def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+ asm, ".16b">;
+ def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+ asm, ".16b">;
+ def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+ asm, ".16b">;
+ def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+ asm, ".16b">;
+
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8One"),
+ V64, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Two"),
+ V64, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Three"),
+ V64, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Four"),
+ V64, VecListFour128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8One"),
+ V128, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Two"),
+ V128, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Three"),
+ V128, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Four"),
+ V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+ string kind, Operand idxtype>
+ : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+ "{\t$dst, $src" # kind # "$idx" #
+ "|\t$dst, $src$idx}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> dst;
+ bits<5> src;
+ let Inst{31-21} = 0b01011110000;
+ let Inst{15-10} = 0b000001;
+ let Inst{9-5} = src;
+ let Inst{4-0} = dst;
+}
+
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+ RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+ : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+ # "|\t$dst, $src$index}",
+ (inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
+
+
+multiclass SIMDScalarCPY<string asm> {
+ def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+
+ def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
+ VectorIndexD:$idx)))),
+ (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
+
+ // 'DUP' mnemonic aliases.
+ def : SIMDScalarCPYAlias<"dup", ".b",
+ !cast<Instruction>(NAME#"i8"),
+ FPR8, V128, VectorIndexB>;
+ def : SIMDScalarCPYAlias<"dup", ".h",
+ !cast<Instruction>(NAME#"i16"),
+ FPR16, V128, VectorIndexH>;
+ def : SIMDScalarCPYAlias<"dup", ".s",
+ !cast<Instruction>(NAME#"i32"),
+ FPR32, V128, VectorIndexS>;
+ def : SIMDScalarCPYAlias<"dup", ".d",
+ !cast<Instruction>(NAME#"i64"),
+ FPR64, V128, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//----------------------------------------------------------------------------
+
+class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
+ string asm, string op_string,
+ string cstr, list<dag> pattern>
+ : I<oops, iops, asm, op_string, cstr, pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<8> imm8;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = op;
+ let Inst{28-19} = 0b0111100000;
+ let Inst{18-16} = imm8{7-5};
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = imm8{4-0};
+ let Inst{4-0} = Rd;
+}
+
+class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
+ Operand immtype, dag opt_shift_iop,
+ string opt_shift, string asm, string kind,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
+ !con((ins immtype:$imm8), opt_shift_iop), asm,
+ "{\t$Rd" # kind # ", $imm8" # opt_shift #
+ "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+ "", pattern> {
+ let DecoderMethod = "DecodeModImmInstruction";
+}
+
+class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
+ Operand immtype, dag opt_shift_iop,
+ string opt_shift, string asm, string kind,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
+ !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
+ asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
+ "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+ "$Rd = $dst", pattern> {
+ let DecoderMethod = "DecodeModImmTiedInstruction";
+}
+
+class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ (ins logical_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14-13} = shift;
+ let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+ (ins logical_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14-13} = shift;
+ let Inst{12} = b15_b12{0};
+}
+
+
+class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ (ins logical_vec_hw_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14} = 0;
+ let Inst{13} = shift{0};
+ let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+ (ins logical_vec_hw_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14} = 0;
+ let Inst{13} = shift{0};
+ let Inst{12} = b15_b12{0};
+}
+
+multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
+ string asm> {
+ def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
+ asm, ".4h", []>;
+ def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
+ asm, ".8h", []>;
+
+ def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
+ asm, ".2s", []>;
+ def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
+ asm, ".4s", []>;
+}
+
+multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
+ bits<2> w_cmode, string asm,
+ SDNode OpNode> {
+ def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+ def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+
+ def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+ def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+}
+
+class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ (ins move_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<1> shift;
+ let Inst{15-13} = cmode{3-1};
+ let Inst{12} = shift;
+}
+
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
+ RegisterOperand vectype,
+ Operand imm_type, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
+ asm, kind, pattern> {
+ let Inst{15-12} = cmode;
+}
+
+class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+ "\t$Rd, $imm8", "", pattern> {
+ let Inst{15-12} = cmode;
+ let DecoderMethod = "DecodeModImmInstruction";
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+ RegisterOperand dst_reg, RegisterOperand lhs_reg,
+ RegisterOperand rhs_reg, Operand vec_idx, string asm,
+ string apple_kind, string dst_kind, string lhs_kind,
+ string rhs_kind, list<dag> pattern>
+ : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
+ asm,
+ "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+ "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28} = Scalar;
+ let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
- let Inst{21-17} = 0b11000;
- let Inst{16-12} = opcode;
- let Inst{11-10} = 0b10;
-
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ // Bit 21 must be set by the derived class.
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opc;
+ // Bit 11 must be set by the derived class.
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD 2 vector across lanes
-class NeonI_2VAcross<bit q, bit u, bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29} = u;
- let Inst{28-24} = 0b01110;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+ RegisterOperand dst_reg, RegisterOperand lhs_reg,
+ RegisterOperand rhs_reg, Operand vec_idx, string asm,
+ string apple_kind, string dst_kind, string lhs_kind,
+ string rhs_kind, list<dag> pattern>
+ : I<(outs dst_reg:$dst),
+ (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
+ "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+ "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28} = Scalar;
+ let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
- let Inst{21-17} = 0b11000;
- let Inst{16-12} = opcode;
- let Inst{11-10} = 0b10;
-
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ // Bit 21 must be set by the derived class.
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opc;
+ // Bit 11 must be set by the derived class.
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD scalar two registers miscellaneous
-class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag ins,
- string asmstr, list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31} = 0b0;
- let Inst{30} = 0b1;
- let Inst{29} = u;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
- let Inst{16-12} = opcode;
- let Inst{11-10} = 0b10;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2f32 V64:$Rd),
+ (OpNode (v2f32 V64:$Rn),
+ (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4f32 V128:$Rd),
+ (OpNode (v4f32 V128:$Rn),
+ (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
+ V128, V128,
+ V128, VectorIndexD,
+ asm, ".2d", ".2d", ".2d", ".d",
+ [(set (v2f64 V128:$Rd),
+ (OpNode (v2f64 V128:$Rn),
+ (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (f32 FPR32Op:$Rd),
+ (OpNode (f32 FPR32Op:$Rn),
+ (f32 (vector_extract (v4f32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
+ FPR64Op, FPR64Op, V128, VectorIndexD,
+ asm, ".d", "", "", ".d",
+ [(set (f64 FPR64Op:$Rd),
+ (OpNode (f64 FPR64Op:$Rn),
+ (f64 (vector_extract (v2f64 V128:$Rm),
+ VectorIndexD:$idx))))]> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
}
-// Format AdvSIMD vector load/store multiple N-element structure
-class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = q;
+multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
+ // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (AArch64duplane32 (v4f32 V128:$Rm),
+ VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # v2i32_indexed)
+ V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (AArch64dup (f32 FPR32Op:$Rm)))),
+ (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+
+ // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (AArch64duplane32 (v4f32 V128:$Rm),
+ VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v4i32_indexed")
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (AArch64dup (f32 FPR32Op:$Rm)))),
+ (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (AArch64duplane64 (v2f64 V128:$Rm),
+ VectorIndexD:$idx))),
+ (!cast<Instruction>(INST # "v2i64_indexed")
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (AArch64dup (f64 FPR64Op:$Rm)))),
+ (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+ // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+ // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+ def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+ (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+ (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+ V128:$Rm, VectorIndexD:$idx)>;
+}
+
+multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+ V128, V128,
+ V128, VectorIndexD,
+ asm, ".2d", ".2d", ".2d", ".d", []> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+
+
+ def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+ FPR64Op, FPR64Op, V128, VectorIndexD,
+ asm, ".d", "", "", ".d", []> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+}
+
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i32 FPR32Op:$Rd),
+ (OpNode FPR32Op:$Rn,
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+ FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR64Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator Accum> {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
+ // intermediate EXTRACT_SUBREG would be untyped.
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract (v4i32
+ (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))),
+ (i64 0))))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME # v4i16_indexed)
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
+ V128_lo:$Rm, VectorIndexH:$idx),
+ ssub)>;
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16
+ (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull
+ (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32
+ (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+ FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+
+ def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR64Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i64 FPR64Op:$dst),
+ (Accum (i64 FPR64Op:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar
+ (i32 FPR32Op:$Rn),
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+ }
+}
+
+multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd),
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd),
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterClass regtype1, RegisterClass regtype2,
+ Operand immtype, string asm, list<dag> pattern>
+ : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<7> imm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterClass regtype1, RegisterClass regtype2,
+ Operand immtype, string asm, list<dag> pattern>
+ : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<7> imm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR32, vecshiftR32, asm, []> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm,
+ [(set (i64 FPR64:$Rd),
+ (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm,
+ [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
+ vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm,
+ [(set (v1i64 FPR64:$Rd),
+ (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
+ def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+ FPR8, FPR16, vecshiftR8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR32, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR64, vecshiftR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
+ let Inst{20-16} = imm{4-0};
+ }
+}
+
+multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+ FPR8, FPR8, vecshiftL8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR16, vecshiftL16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR32, vecshiftL32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+ FPR8, FPR8, vecshiftR8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR16, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR32, vecshiftR32, asm, []> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector x indexed element
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterOperand dst_reg, RegisterOperand src_reg,
+ Operand immtype,
+ string asm, string dst_kind, string src_kind,
+ list<dag> pattern>
+ : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
+ asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+ "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterOperand vectype1, RegisterOperand vectype2,
+ Operand immtype,
+ string asm, string dst_kind, string src_kind,
+ list<dag> pattern>
+ : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
+ asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+ "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V128, vecshiftR16Narrow,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR16Narrow,
+ asm#"2", ".16b", ".8h", []> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V128, vecshiftR32Narrow,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR32Narrow,
+ asm#"2", ".8h", ".4s", []> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V128, vecshiftR64Narrow,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR64Narrow,
+ asm#"2", ".4s", ".2d", []> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions
+ // themselves, so put them here instead.
+
+ // Patterns involving what's effectively an insert high and a normal
+ // intrinsic, represented by CONCAT_VECTORS.
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
+ vecshiftR16Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v16i8_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR16Narrow:$imm)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
+ vecshiftR32Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v8i16_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
+ vecshiftR64Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v4i32_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR64Narrow:$imm)>;
+}
+
+multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftL8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftL16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftL32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftL64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftR8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftR8, asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR8, asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16, asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16, asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32, asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32, asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftL8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftL16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftL32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftL64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V128, V64, vecshiftL8, asm, ".8h", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm#"2", ".8h", ".16b",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V128, V64, vecshiftL16, asm, ".4s", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm#"2", ".4s", ".8h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V128, V64, vecshiftL32, asm, ".2d", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm#"2", ".2d", ".4s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+}
+
+
+//---
+// Vector load/store
+//---
+// SIMD ldX/stX no-index memory references don't allow the optional
+// ", #0" constant and handle post-indexing explicitly, so we use
+// a more specialized parse method for them. Otherwise, it's the same as
+// the general GPR64sp handling.
+
+class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
+ string asm, dag oops, dag iops, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
+ bits<5> Vt;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
let Inst{29-23} = 0b0011000;
- let Inst{22} = l;
+ let Inst{22} = L;
let Inst{21-16} = 0b000000;
let Inst{15-12} = opcode;
let Inst{11-10} = size;
-
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
}
-// Format AdvSIMD vector load/store multiple N-element structure (post-index)
-class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtnm<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = q;
+class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {
+ bits<5> Vt;
+ bits<5> Rn;
+ bits<5> Xm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
let Inst{29-23} = 0b0011001;
- let Inst{22} = l;
- let Inst{21} = 0b0;
- // Inherit Rm in 20-16
+ let Inst{22} = L;
+ let Inst{21} = 0;
+ let Inst{20-16} = Xm;
let Inst{15-12} = opcode;
let Inst{11-10} = size;
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
}
-// Format AdvSIMD vector load Single N-element structure to all lanes
-class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
- dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29-23} = 0b0011010;
- let Inst{22} = 0b1;
- let Inst{21} = r;
- let Inst{20-16} = 0b00000;
+// The immediate form of AdvSIMD post-indexed addressing is encoded with
+// register post-index addressing from the zero register.
+multiclass SIMDLdStAliases<string asm, string layout, string Count,
+ int Offset, int Size> {
+ // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
+ // "ld1\t$Vt, [$Rn], #16"
+ // may get mapped to
+ // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
+ def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ XZR), 1>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1], #16"
+ // "ld1.8b\t$Vt, [$Rn], #16"
+ // may get mapped to
+ // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ XZR), 0>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1]"
+ // "ld1\t$Vt, [$Rn]"
+ // may get mapped to
+ // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+ (!cast<Instruction>(NAME # Count # "v" # layout)
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ GPR64sp:$Rn), 0>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1], x2"
+ // "ld1\t$Vt, [$Rn], $Xm"
+ // may get mapped to
+ // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
+ int Offset64, bits<4> opcode> {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
+ (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
+ (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+ def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+
+
+ def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "16b"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "8h"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "4s"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "2d"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "8b"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "4h"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "2s"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+// Only ld1/st1 has a v1d version.
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
+ int Offset64, bits<4> opcode> {
+ let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
+ def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+ GPR64sp:$Rn), []>;
+ def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+ GPR64sp:$Rn), []>;
+
+ def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode>
+ : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+ // LD1 instructions have extra "1d" variants.
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+ (ins GPR64sp:$Rn), []>;
+
+ def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
+ (outs GPR64sp:$wback,
+ !cast<RegisterOperand>(veclist # "1d"):$Vt),
+ (ins GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode>
+ : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+ // ST1 instructions have extra "1d" variants.
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+ GPR64sp:$Rn), []>;
+
+ def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
+ (outs GPR64sp:$wback),
+ (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass SIMDLd1Multiple<string asm> {
+ defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDSt1Multiple<string asm> {
+ defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDLd2Multiple<string asm> {
+ defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDSt2Multiple<string asm> {
+ defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDLd3Multiple<string asm> {
+ defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDSt3Multiple<string asm> {
+ defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDLd4Multiple<string asm> {
+ defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+multiclass SIMDSt4Multiple<string asm> {
+ defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+//---
+// AdvSIMD Load/store single-element
+//---
+
+class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
+ string asm, string operands, string cst,
+ dag oops, dag iops, list<dag> pattern>
+ : I<oops, iops, asm, operands, cst, pattern> {
+ bits<5> Vt;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{29-24} = 0b001101;
+ let Inst{22} = L;
+ let Inst{21} = R;
let Inst{15-13} = opcode;
- let Inst{12} = 0b0;
- let Inst{11-10} = size;
-
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
}
-// Format AdvSIMD vector load/store Single N-element structure to/from one lane
-class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
- dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
- bits<4> lane;
- let Inst{31} = 0b0;
- let Inst{29-23} = 0b0011010;
- let Inst{22} = l;
- let Inst{21} = r;
- let Inst{20-16} = 0b00000;
- let Inst{15-14} = op2_1;
- let Inst{13} = op0;
-
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD post-index vector load Single N-element structure to all lanes
-class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
- dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : A64InstRtnm<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29-23} = 0b0011011;
- let Inst{22} = 0b1;
- let Inst{21} = r;
- // Inherit Rm in 20-16
+class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
+ string asm, string operands, string cst,
+ dag oops, dag iops, list<dag> pattern>
+ : I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> {
+ bits<5> Vt;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{29-24} = 0b001101;
+ let Inst{22} = L;
+ let Inst{21} = R;
let Inst{15-13} = opcode;
- let Inst{12} = 0b0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Vt;
+}
+
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
+ Operand listtype>
+ : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
+ (outs listtype:$Vt), (ins GPR64sp:$Rn),
+ []> {
+ let Inst{30} = Q;
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = S;
let Inst{11-10} = size;
-
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
+ string asm, Operand listtype, Operand GPR64pi>
+ : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
+ "$Rn = $wback",
+ (outs GPR64sp:$wback, listtype:$Vt),
+ (ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
+ bits<5> Xm;
+ let Inst{30} = Q;
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = S;
+ let Inst{11-10} = size;
}
-// Format AdvSIMD post-index vector load/store Single N-element structure
-// to/from one lane
-class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
- dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtnm<outs, ins, asmstr, patterns, itin>
-{
- bits<4> lane;
- let Inst{31} = 0b0;
- let Inst{29-23} = 0b0011011;
- let Inst{22} = l;
- let Inst{21} = r;
- // Inherit Rm in 20-16
- let Inst{15-14} = op2_1;
- let Inst{13} = op0;
-
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
+multiclass SIMDLdrAliases<string asm, string layout, string Count,
+ int Offset, int Size> {
+ // E.g. "ld1r { v0.8b }, [x1], #1"
+ // "ld1r.8b\t$Vt, [$Rn], #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+ def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ XZR), 1>;
+
+ // E.g. "ld1r.8b { v0 }, [x1], #1"
+ // "ld1r.8b\t$Vt, [$Rn], #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ XZR), 0>;
+
+ // E.g. "ld1r.8b { v0 }, [x1]"
+ // "ld1r.8b\t$Vt, [$Rn]"
+ // may get mapped to
+ // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+ (!cast<Instruction>(NAME # "v" # layout)
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ GPR64sp:$Rn), 0>;
+
+ // E.g. "ld1r.8b { v0 }, [x1], x2"
+ // "ld1r.8b\t$Vt, [$Rn], $Xm"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
}
-// Format AdvSIMD 3 scalar registers with different type
+multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
+ int Offset1, int Offset2, int Offset4, int Offset8> {
+ def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "8b")>;
+ def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count #"16b")>;
+ def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count #"4h")>;
+ def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count #"8h")>;
+ def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count #"2s")>;
+ def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count #"4s")>;
+ def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count #"1d")>;
+ def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count #"2d")>;
-class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31-30} = 0b01;
- let Inst{29} = u;
- let Inst{28-24} = 0b11110;
- let Inst{23-22} = size;
- let Inst{21} = 0b1;
- // Inherit Rm in 20-16
- let Inst{15-12} = opcode;
- let Inst{11-10} = 0b00;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "8b"),
+ !cast<Operand>("GPR64pi" # Offset1)>;
+ def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "16b"),
+ !cast<Operand>("GPR64pi" # Offset1)>;
+ def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count # "4h"),
+ !cast<Operand>("GPR64pi" # Offset2)>;
+ def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count # "8h"),
+ !cast<Operand>("GPR64pi" # Offset2)>;
+ def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count # "2s"),
+ !cast<Operand>("GPR64pi" # Offset4)>;
+ def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count # "4s"),
+ !cast<Operand>("GPR64pi" # Offset4)>;
+ def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count # "1d"),
+ !cast<Operand>("GPR64pi" # Offset8)>;
+ def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count # "2d"),
+ !cast<Operand>("GPR64pi" # Offset8)>;
+
+ defm : SIMDLdrAliases<asm, "8b", Count, Offset1, 64>;
+ defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
+ defm : SIMDLdrAliases<asm, "4h", Count, Offset2, 64>;
+ defm : SIMDLdrAliases<asm, "8h", Count, Offset2, 128>;
+ defm : SIMDLdrAliases<asm, "2s", Count, Offset4, 64>;
+ defm : SIMDLdrAliases<asm, "4s", Count, Offset4, 128>;
+ defm : SIMDLdrAliases<asm, "1d", Count, Offset8, 64>;
+ defm : SIMDLdrAliases<asm, "2d", Count, Offset8, 128>;
}
-// Format AdvSIMD scalar shift by immediate
-
-class NeonI_ScalarShiftImm<bit u, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- bits<4> Imm4;
- bits<3> Imm3;
- let Inst{31-30} = 0b01;
- let Inst{29} = u;
- let Inst{28-23} = 0b111110;
- let Inst{22-19} = Imm4;
- let Inst{18-16} = Imm3;
- let Inst{15-11} = opcode;
- let Inst{10} = 0b1;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ let Inst{30} = idx{3};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+ oops, iops, pattern> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ let Inst{30} = idx{3};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{3};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{3};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
}
-// Format AdvSIMD crypto AES
-class NeonI_Crypto_AES<bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31-24} = 0b01001110;
- let Inst{23-22} = size;
- let Inst{21-17} = 0b10100;
- let Inst{16-12} = opcode;
+class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ let Inst{30} = idx{2};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+ oops, iops, pattern> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ let Inst{30} = idx{2};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+
+class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{2};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{2};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ let Inst{30} = idx{1};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+ oops, iops, pattern> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ let Inst{30} = idx{1};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{1};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{1};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+ pattern> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ let Inst{30} = idx;
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+ oops, iops, pattern> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ let Inst{30} = idx;
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ bits<5> Xm;
+ let Inst{30} = idx;
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+ "$Rn = $wback", oops, iops, []> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ bits<5> Xm;
+ let Inst{30} = idx;
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> {
+ def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> {
+ def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> {
+ def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
+ (outs GPR64sp:$wback, listtype:$dst),
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i8 : SIMDLdStSingleB<0, R, opcode, asm,
+ (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
+ (outs GPR64sp:$wback),
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
+ (outs GPR64sp:$wback),
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
+ (outs GPR64sp:$wback),
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn), []>;
+
+ def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
+ (outs GPR64sp:$wback),
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+
+multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
+ string Count, int Offset, Operand idxtype> {
+ // E.g. "ld1 { v0.8b }[0], [x1], #1"
+ // "ld1\t$Vt, [$Rn], #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+ def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Type # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ idxtype:$idx, XZR), 1>;
+
+ // E.g. "ld1.8b { v0 }[0], [x1], #1"
+ // "ld1.8b\t$Vt, [$Rn], #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
+ (!cast<Instruction>(NAME # Type # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx, XZR), 0>;
+
+ // E.g. "ld1.8b { v0 }[0], [x1]"
+ // "ld1.8b\t$Vt, [$Rn]"
+ // may get mapped to
+ // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
+ (!cast<Instruction>(NAME # Type)
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx, GPR64sp:$Rn), 0>;
+
+ // E.g. "ld1.8b { v0 }[0], [x1], x2"
+ // "ld1.8b\t$Vt, [$Rn], $Xm"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
+ (!cast<Instruction>(NAME # Type # "_POST")
+ GPR64sp:$Rn,
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdSt1SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+}
+
+multiclass SIMDLdSt2SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+}
+
+multiclass SIMDLdSt3SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+}
+
+multiclass SIMDLdSt4SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+}
+} // end of 'let Predicates = [HasNEON]'
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+let Predicates = [HasCrypto] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
+ list<dag> pat>
+ : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
+ Sched<[WriteV]>{
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-16} = 0b0100111000101000;
+ let Inst{15-12} = opc;
let Inst{11-10} = 0b10;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD crypto SHA
-class NeonI_Crypto_SHA<bits<2> size, bits<5> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdn<outs, ins, asmstr, patterns, itin> {
- let Inst{31-24} = 0b01011110;
- let Inst{23-22} = size;
- let Inst{21-17} = 0b10100;
- let Inst{16-12} = opcode;
- let Inst{11-10} = 0b10;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
+class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
+ : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-// Format AdvSIMD crypto 3V SHA
-class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- let Inst{31-24} = 0b01011110;
- let Inst{23-22} = size;
- let Inst{21} = 0b0;
- // Inherit Rm in 20-16
- let Inst{15} = 0b0;
- let Inst{14-12} = opcode;
+class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
+ : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
+ "$Rd = $dst",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
+ dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm,
+ "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
+ "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
+ Sched<[WriteV]>{
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-21} = 0b01011110000;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
-// Format AdvSIMD scalar x indexed element
-class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo,
- bits<4> opcode, dag outs, dag ins,
- string asmstr, list<dag> patterns,
- InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = 0b1;
- let Inst{29} = u;
- let Inst{28-24} = 0b11111;
- let Inst{23} = szhi;
- let Inst{22} = szlo;
- // l in Inst{21}
- // m in Instr{20}
- // Inherit Rm in 19-16
- let Inst{15-12} = opcode;
- // h in Inst{11}
- let Inst{10} = 0b0;
- // Inherit Rn in 9-5
- // Inherit Rd in 4-0
-}
-// Format AdvSIMD scalar copy - insert from element to scalar
-class NeonI_ScalarCopy<dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> {
- let Inst{28} = 0b1;
-}
+class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
+ : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+ (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
+ [(set (v4i32 FPR128:$dst),
+ (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
+ : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn, V128:$Rm),
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
+ : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+ (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
+ [(set (v4i32 FPR128:$dst),
+ (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA2OpInst<bits<4> opc, string asm, string kind,
+ string cstr, dag oops, dag iops,
+ list<dag> pat>
+ : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
+ "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
+ Sched<[WriteV]>{
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-16} = 0b0101111000101000;
+ let Inst{15-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
}
+class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
+ : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn),
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+
+class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
+ : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+} // end of 'let Predicates = [HasCrypto]'
+
+// Allow the size specifier tokens to be upper case, not just lower.
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index afb2034..ff115c0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -11,257 +11,83 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64.h"
#include "AArch64InstrInfo.h"
-#include "AArch64MachineFunctionInfo.h"
-#include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineDominators.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#include <algorithm>
+
+using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
-using namespace llvm;
-
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
- : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
- Subtarget(STI) {}
+ : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+ RI(this, &STI), Subtarget(STI) {}
-void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
- unsigned Opc = 0;
- unsigned ZeroReg = 0;
- if (DestReg == AArch64::XSP || SrcReg == AArch64::XSP) {
- // E.g. ADD xDst, xsp, #0 (, lsl #0)
- BuildMI(MBB, I, DL, get(AArch64::ADDxxi_lsl0_s), DestReg)
- .addReg(SrcReg)
- .addImm(0);
- return;
- } else if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
- // E.g. ADD wDST, wsp, #0 (, lsl #0)
- BuildMI(MBB, I, DL, get(AArch64::ADDwwi_lsl0_s), DestReg)
- .addReg(SrcReg)
- .addImm(0);
- return;
- } else if (DestReg == AArch64::NZCV) {
- assert(AArch64::GPR64RegClass.contains(SrcReg));
- // E.g. MSR NZCV, xDST
- BuildMI(MBB, I, DL, get(AArch64::MSRix))
- .addImm(A64SysReg::NZCV)
- .addReg(SrcReg);
- } else if (SrcReg == AArch64::NZCV) {
- assert(AArch64::GPR64RegClass.contains(DestReg));
- // E.g. MRS xDST, NZCV
- BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg)
- .addImm(A64SysReg::NZCV);
- } else if (AArch64::GPR64RegClass.contains(DestReg)) {
- if(AArch64::GPR64RegClass.contains(SrcReg)){
- Opc = AArch64::ORRxxx_lsl;
- ZeroReg = AArch64::XZR;
- } else{
- assert(AArch64::FPR64RegClass.contains(SrcReg));
- BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg)
- .addReg(SrcReg);
- return;
- }
- } else if (AArch64::GPR32RegClass.contains(DestReg)) {
- if(AArch64::GPR32RegClass.contains(SrcReg)){
- Opc = AArch64::ORRwww_lsl;
- ZeroReg = AArch64::WZR;
- } else{
- assert(AArch64::FPR32RegClass.contains(SrcReg));
- BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg)
- .addReg(SrcReg);
- return;
- }
- } else if (AArch64::FPR32RegClass.contains(DestReg)) {
- if(AArch64::FPR32RegClass.contains(SrcReg)){
- BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
- .addReg(SrcReg);
- return;
- }
- else {
- assert(AArch64::GPR32RegClass.contains(SrcReg));
- BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg)
- .addReg(SrcReg);
- return;
- }
- } else if (AArch64::FPR64RegClass.contains(DestReg)) {
- if(AArch64::FPR64RegClass.contains(SrcReg)){
- BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
- .addReg(SrcReg);
- return;
- }
- else {
- assert(AArch64::GPR64RegClass.contains(SrcReg));
- BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg)
- .addReg(SrcReg);
- return;
- }
- } else if (AArch64::FPR128RegClass.contains(DestReg)) {
- assert(AArch64::FPR128RegClass.contains(SrcReg));
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be. This returns the maximum number of bytes.
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const MCInstrDesc &Desc = MI->getDesc();
- // If NEON is enable, we use ORR to implement this copy.
- // If NEON isn't available, emit STR and LDR to handle this.
- if(getSubTarget().hasNEON()) {
- BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg)
- .addReg(SrcReg)
- .addReg(SrcReg);
- return;
- } else {
- BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
- .addReg(SrcReg)
- .addReg(AArch64::XSP)
- .addImm(0x1ff & -16);
-
- BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
- .addReg(AArch64::XSP, RegState::Define)
- .addReg(AArch64::XSP)
- .addImm(16);
- return;
- }
- } else if (AArch64::FPR8RegClass.contains(DestReg, SrcReg)) {
- // The copy of two FPR8 registers is implemented by the copy of two FPR32
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_8,
- &AArch64::FPR32RegClass);
- unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_8,
- &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst)
- .addReg(Src);
- return;
- } else if (AArch64::FPR16RegClass.contains(DestReg, SrcReg)) {
- // The copy of two FPR16 registers is implemented by the copy of two FPR32
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_16,
- &AArch64::FPR32RegClass);
- unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_16,
- &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst)
- .addReg(Src);
- return;
- } else {
- CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg);
- return;
- }
-
- // E.g. ORR xDst, xzr, xSrc, lsl #0
- BuildMI(MBB, I, DL, get(Opc), DestReg)
- .addReg(ZeroReg)
- .addReg(SrcReg)
- .addImm(0);
-}
-
-void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg,
- unsigned SrcReg) const {
- unsigned SubRegs;
- bool IsQRegs;
- if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) {
- SubRegs = 2;
- IsQRegs = false;
- } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) {
- SubRegs = 3;
- IsQRegs = false;
- } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) {
- SubRegs = 4;
- IsQRegs = false;
- } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) {
- SubRegs = 2;
- IsQRegs = true;
- } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) {
- SubRegs = 3;
- IsQRegs = true;
- } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) {
- SubRegs = 4;
- IsQRegs = true;
- } else
- llvm_unreachable("Unknown register class");
-
- unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0;
- int Spacing = 1;
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- // Copy register tuples backward when the first Dest reg overlaps
- // with SrcReg.
- if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
- BeginIdx = BeginIdx + (SubRegs - 1);
- Spacing = -1;
- }
-
- unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B;
- for (unsigned i = 0; i != SubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
- unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
- assert(Dst && Src && "Bad sub-register");
- BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
- .addReg(Src)
- .addReg(Src);
- }
- return;
-}
-
-/// Does the Opcode represent a conditional branch that we can remove and re-add
-/// at the end of a basic block?
-static bool isCondBranch(unsigned Opc) {
- return Opc == AArch64::Bcc || Opc == AArch64::CBZw || Opc == AArch64::CBZx ||
- Opc == AArch64::CBNZw || Opc == AArch64::CBNZx ||
- Opc == AArch64::TBZwii || Opc == AArch64::TBZxii ||
- Opc == AArch64::TBNZwii || Opc == AArch64::TBNZxii;
-}
-
-/// Takes apart a given conditional branch MachineInstr (see isCondBranch),
-/// setting TBB to the destination basic block and populating the Cond vector
-/// with data necessary to recreate the conditional branch at a later
-/// date. First element will be the opcode, and subsequent ones define the
-/// conditions being branched on in an instruction-specific manner.
-static void classifyCondBranch(MachineInstr *I, MachineBasicBlock *&TBB,
- SmallVectorImpl<MachineOperand> &Cond) {
- switch(I->getOpcode()) {
- case AArch64::Bcc:
- case AArch64::CBZw:
- case AArch64::CBZx:
- case AArch64::CBNZw:
- case AArch64::CBNZx:
- // These instructions just have one predicate operand in position 0 (either
- // a condition code or a register being compared).
- Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
- Cond.push_back(I->getOperand(0));
- TBB = I->getOperand(1).getMBB();
- return;
- case AArch64::TBZwii:
- case AArch64::TBZxii:
- case AArch64::TBNZwii:
- case AArch64::TBNZxii:
- // These have two predicate operands: a register and a bit position.
- Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
- Cond.push_back(I->getOperand(0));
- Cond.push_back(I->getOperand(1));
- TBB = I->getOperand(2).getMBB();
- return;
+ switch (Desc.getOpcode()) {
default:
- llvm_unreachable("Unknown conditional branch to classify");
+ // Anything not explicitly designated otherwise is a nomal 4-byte insn.
+ return 4;
+ case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ return 0;
+ }
+
+ llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size");
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ // Block ends with fall-through condbranch.
+ switch (LastInst->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown branch instruction?");
+ case AArch64::Bcc:
+ Target = LastInst->getOperand(1).getMBB();
+ Cond.push_back(LastInst->getOperand(0));
+ break;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ Target = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(-1));
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ break;
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ Target = LastInst->getOperand(2).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(-1));
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ Cond.push_back(LastInst->getOperand(1));
}
}
-
-bool
-AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+// Branch analysis.
+bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.end();
if (I == MBB.begin())
@@ -281,15 +107,16 @@
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
- if (LastOpc == AArch64::Bimm) {
+ if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
return false;
}
- if (isCondBranch(LastOpc)) {
- classifyCondBranch(LastInst, TBB, Cond);
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ parseCondBranch(LastInst, TBB, Cond);
return false;
}
- return true; // Can't handle indirect branch.
+ return true; // Can't handle indirect branch.
}
// Get the instruction before it if it is a terminator.
@@ -298,8 +125,8 @@
// If AllowModify is true and the block ends with two or more unconditional
// branches, delete all but the first unconditional branch.
- if (AllowModify && LastOpc == AArch64::Bimm) {
- while (SecondLastOpc == AArch64::Bimm) {
+ if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+ while (isUncondBranchOpcode(SecondLastOpc)) {
LastInst->eraseFromParent();
LastInst = SecondLastInst;
LastOpc = LastInst->getOpcode();
@@ -319,23 +146,15 @@
return true;
// If the block ends with a B and a Bcc, handle it.
- if (LastOpc == AArch64::Bimm) {
- if (SecondLastOpc == AArch64::Bcc) {
- TBB = SecondLastInst->getOperand(1).getMBB();
- Cond.push_back(MachineOperand::CreateImm(AArch64::Bcc));
- Cond.push_back(SecondLastInst->getOperand(0));
- FBB = LastInst->getOperand(0).getMBB();
- return false;
- } else if (isCondBranch(SecondLastOpc)) {
- classifyCondBranch(SecondLastInst, TBB, Cond);
- FBB = LastInst->getOperand(0).getMBB();
- return false;
- }
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ parseCondBranch(SecondLastInst, TBB, Cond);
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
}
// If the block ends with two unconditional branches, handle it. The second
// one is not executed, so remove it.
- if (SecondLastOpc == AArch64::Bimm && LastOpc == AArch64::Bimm) {
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
TBB = SecondLastInst->getOperand(0).getMBB();
I = LastInst;
if (AllowModify)
@@ -343,84 +162,72 @@
return false;
}
+ // ...likewise if it ends with an indirect branch followed by an unconditional
+ // branch.
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
// Otherwise, can't handle this.
return true;
}
bool AArch64InstrInfo::ReverseBranchCondition(
- SmallVectorImpl<MachineOperand> &Cond) const {
- switch (Cond[0].getImm()) {
- case AArch64::Bcc: {
- A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(Cond[1].getImm());
- CC = A64InvertCondCode(CC);
- Cond[1].setImm(CC);
- return false;
- }
- case AArch64::CBZw:
- Cond[0].setImm(AArch64::CBNZw);
- return false;
- case AArch64::CBZx:
- Cond[0].setImm(AArch64::CBNZx);
- return false;
- case AArch64::CBNZw:
- Cond[0].setImm(AArch64::CBZw);
- return false;
- case AArch64::CBNZx:
- Cond[0].setImm(AArch64::CBZx);
- return false;
- case AArch64::TBZwii:
- Cond[0].setImm(AArch64::TBNZwii);
- return false;
- case AArch64::TBZxii:
- Cond[0].setImm(AArch64::TBNZxii);
- return false;
- case AArch64::TBNZwii:
- Cond[0].setImm(AArch64::TBZwii);
- return false;
- case AArch64::TBNZxii:
- Cond[0].setImm(AArch64::TBZxii);
- return false;
- default:
- llvm_unreachable("Unknown branch type");
- }
-}
-
-
-unsigned
-AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const {
- if (FBB == 0 && Cond.empty()) {
- BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(TBB);
- return 1;
- } else if (FBB == 0) {
- MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
- for (int i = 1, e = Cond.size(); i != e; ++i)
- MIB.addOperand(Cond[i]);
- MIB.addMBB(TBB);
- return 1;
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ if (Cond[0].getImm() != -1) {
+ // Regular Bcc
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+ Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
+ } else {
+ // Folded compare-and-branch
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown conditional branch!");
+ case AArch64::CBZW:
+ Cond[1].setImm(AArch64::CBNZW);
+ break;
+ case AArch64::CBNZW:
+ Cond[1].setImm(AArch64::CBZW);
+ break;
+ case AArch64::CBZX:
+ Cond[1].setImm(AArch64::CBNZX);
+ break;
+ case AArch64::CBNZX:
+ Cond[1].setImm(AArch64::CBZX);
+ break;
+ case AArch64::TBZW:
+ Cond[1].setImm(AArch64::TBNZW);
+ break;
+ case AArch64::TBNZW:
+ Cond[1].setImm(AArch64::TBZW);
+ break;
+ case AArch64::TBZX:
+ Cond[1].setImm(AArch64::TBNZX);
+ break;
+ case AArch64::TBNZX:
+ Cond[1].setImm(AArch64::TBZX);
+ break;
+ }
}
- MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
- for (int i = 1, e = Cond.size(); i != e; ++i)
- MIB.addOperand(Cond[i]);
- MIB.addMBB(TBB);
-
- BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(FBB);
- return 2;
+ return false;
}
unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin()) return 0;
+ if (I == MBB.begin())
+ return 0;
--I;
while (I->isDebugValue()) {
if (I == MBB.begin())
return 0;
--I;
}
- if (I->getOpcode() != AArch64::Bimm && !isCondBranch(I->getOpcode()))
+ if (!isUncondBranchOpcode(I->getOpcode()) &&
+ !isCondBranchOpcode(I->getOpcode()))
return 0;
// Remove the branch.
@@ -428,9 +235,10 @@
I = MBB.end();
- if (I == MBB.begin()) return 1;
+ if (I == MBB.begin())
+ return 1;
--I;
- if (!isCondBranch(I->getOpcode()))
+ if (!isCondBranchOpcode(I->getOpcode()))
return 1;
// Remove the branch.
@@ -438,542 +246,1838 @@
return 2;
}
-bool
-AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const {
- MachineInstr &MI = *MBBI;
- MachineBasicBlock &MBB = *MI.getParent();
+void AArch64InstrInfo::instantiateCondBranch(
+ MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ if (Cond[0].getImm() != -1) {
+ // Regular Bcc
+ BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
+ } else {
+ // Folded compare-and-branch
+ const MachineInstrBuilder MIB =
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+ if (Cond.size() > 3)
+ MIB.addImm(Cond[3].getImm());
+ MIB.addMBB(TBB);
+ }
+}
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AArch64::TLSDESC_BLRx: {
- MachineInstr *NewMI =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), get(AArch64::TLSDESCCALL))
- .addOperand(MI.getOperand(1));
- MI.setDesc(get(AArch64::BLRx));
+unsigned AArch64InstrInfo::InsertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
- llvm::finalizeBundle(MBB, NewMI, *++MBBI);
+ if (!FBB) {
+ if (Cond.empty()) // Unconditional branch?
+ BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
+ else
+ instantiateCondBranch(MBB, DL, TBB, Cond);
+ return 1;
+ }
+
+ // Two-way conditional branch.
+ instantiateCondBranch(MBB, DL, TBB, Cond);
+ BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
+ return 2;
+}
+
+// Find the original register that VReg is copied from.
+static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+ while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+ const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (!DefMI->isFullCopy())
+ return VReg;
+ VReg = DefMI->getOperand(1).getReg();
+ }
+ return VReg;
+}
+
+// Determine if VReg is defined by an instruction that can be folded into a
+// csel instruction. If so, return the folded opcode, and the replacement
+// register.
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
+ unsigned *NewVReg = nullptr) {
+ VReg = removeCopies(MRI, VReg);
+ if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ return 0;
+
+ bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
+ const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ unsigned Opc = 0;
+ unsigned SrcOpNum = 0;
+ switch (DefMI->getOpcode()) {
+ case AArch64::ADDSXri:
+ case AArch64::ADDSWri:
+ // if NZCV is used, do not fold.
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+ return 0;
+ // fall-through to ADDXri and ADDWri.
+ case AArch64::ADDXri:
+ case AArch64::ADDWri:
+ // add x, 1 -> csinc.
+ if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
+ DefMI->getOperand(3).getImm() != 0)
+ return 0;
+ SrcOpNum = 1;
+ Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+ break;
+
+ case AArch64::ORNXrr:
+ case AArch64::ORNWrr: {
+ // not x -> csinv, represented as orn dst, xzr, src.
+ unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+ if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+ return 0;
+ SrcOpNum = 2;
+ Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
+ break;
+ }
+
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSWrr:
+ // if NZCV is used, do not fold.
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+ return 0;
+ // fall-through to SUBXrr and SUBWrr.
+ case AArch64::SUBXrr:
+ case AArch64::SUBWrr: {
+ // neg x -> csneg, represented as sub dst, xzr, src.
+ unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+ if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+ return 0;
+ SrcOpNum = 2;
+ Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
+ break;
+ }
+ default:
+ return 0;
+ }
+ assert(Opc && SrcOpNum && "Missing parameters");
+
+ if (NewVReg)
+ *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+ return Opc;
+}
+
+bool AArch64InstrInfo::canInsertSelect(
+ const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
+ int &FalseCycles) const {
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+ // Expanding cbz/tbz requires an extra cycle of latency on the condition.
+ unsigned ExtraCondLat = Cond.size() != 1;
+
+ // GPRs are handled by csel.
+ // FIXME: Fold in x+1, -x, and ~x when applicable.
+ if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
+ AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+ // Single-cycle csel, csinc, csinv, and csneg.
+ CondCycles = 1 + ExtraCondLat;
+ TrueCycles = FalseCycles = 1;
+ if (canFoldIntoCSel(MRI, TrueReg))
+ TrueCycles = 0;
+ else if (canFoldIntoCSel(MRI, FalseReg))
+ FalseCycles = 0;
return true;
+ }
+
+ // Scalar floating point is handled by fcsel.
+ // FIXME: Form fabs, fmin, and fmax when applicable.
+ if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
+ AArch64::FPR32RegClass.hasSubClassEq(RC)) {
+ CondCycles = 5 + ExtraCondLat;
+ TrueCycles = FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
+void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // Parse the condition code, see parseCondBranch() above.
+ AArch64CC::CondCode CC;
+ switch (Cond.size()) {
+ default:
+ llvm_unreachable("Unknown condition opcode in Cond");
+ case 1: // b.cc
+ CC = AArch64CC::CondCode(Cond[0].getImm());
+ break;
+ case 3: { // cbz/cbnz
+ // We must insert a compare against 0.
+ bool Is64Bit;
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown branch opcode in Cond");
+ case AArch64::CBZW:
+ Is64Bit = 0;
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBZX:
+ Is64Bit = 1;
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBNZW:
+ Is64Bit = 0;
+ CC = AArch64CC::NE;
+ break;
+ case AArch64::CBNZX:
+ Is64Bit = 1;
+ CC = AArch64CC::NE;
+ break;
}
+ unsigned SrcReg = Cond[2].getReg();
+ if (Is64Bit) {
+ // cmp reg, #0 is actually subs xzr, reg, #0.
+ MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
+ BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ } else {
+ MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
+ BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ }
+ break;
+ }
+ case 4: { // tbz/tbnz
+ // We must insert a tst instruction.
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown branch opcode in Cond");
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ CC = AArch64CC::NE;
+ break;
+ }
+ // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
+ if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
+ BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
+ .addReg(Cond[2].getReg())
+ .addImm(
+ AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
+ else
+ BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
+ .addReg(Cond[2].getReg())
+ .addImm(
+ AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
+ break;
+ }
+ }
+
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ bool TryFold = false;
+ if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
+ RC = &AArch64::GPR64RegClass;
+ Opc = AArch64::CSELXr;
+ TryFold = true;
+ } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
+ RC = &AArch64::GPR32RegClass;
+ Opc = AArch64::CSELWr;
+ TryFold = true;
+ } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FCSELDrrr;
+ } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
+ RC = &AArch64::FPR32RegClass;
+ Opc = AArch64::FCSELSrrr;
+ }
+ assert(RC && "Unsupported regclass");
+
+ // Try folding simple instructions into the csel.
+ if (TryFold) {
+ unsigned NewVReg = 0;
+ unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+ if (FoldedOpc) {
+ // The folded opcodes csinc, csinc and csneg apply the operation to
+ // FalseReg, so we need to invert the condition.
+ CC = AArch64CC::getInvertedCondCode(CC);
+ TrueReg = FalseReg;
+ } else
+ FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+
+ // Fold the operation. Leave any dead instructions for DCE to clean up.
+ if (FoldedOpc) {
+ FalseReg = NewVReg;
+ Opc = FoldedOpc;
+ // The extends the live range of NewVReg.
+ MRI.clearKillFlags(NewVReg);
+ }
+ }
+
+ // Pull all virtual register into the appropriate class.
+ MRI.constrainRegClass(TrueReg, RC);
+ MRI.constrainRegClass(FalseReg, RC);
+
+ // Insert the csel.
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
+ CC);
+}
+
+bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
default:
return false;
+ case AArch64::SBFMXri: // aka sxtw
+ case AArch64::UBFMXri: // aka uxtw
+ // Check for the 32 -> 64 bit extension case, these instructions can do
+ // much more.
+ if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
+ return false;
+ // This is a signed or unsigned 32 -> 64 bit extension.
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SubIdx = AArch64::sub_32;
+ return true;
+ }
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+/// Return true if the comparison instruction can be analyzed.
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSWrx:
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXrs:
+ case AArch64::SUBSXrx:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSWrx:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXrs:
+ case AArch64::ADDSXrx:
+ // Replace SUBSWrr with SUBWrr if NZCV is not used.
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = MI->getOperand(2).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case AArch64::SUBSWri:
+ case AArch64::ADDSWri:
+ case AArch64::SUBSXri:
+ case AArch64::ADDSXri:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(2).getImm();
+ return true;
+ case AArch64::ANDSWri:
+ case AArch64::ANDSXri:
+ // ANDS does not use the same encoding scheme as the others xxxS
+ // instructions.
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = AArch64_AM::decodeLogicalImmediate(
+ MI->getOperand(2).getImm(),
+ MI->getOpcode() == AArch64::ANDSWri ? 32 : 64);
+ return true;
}
return false;
}
-void
-AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill,
- int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- DebugLoc DL = MBB.findDebugLoc(MBBI);
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
+static bool UpdateOperandRegClass(MachineInstr *Instr) {
+ MachineBasicBlock *MBB = Instr->getParent();
+ assert(MBB && "Can't get MachineBasicBlock here");
+ MachineFunction *MF = MBB->getParent();
+ assert(MF && "Can't get MachineFunction here");
+ const TargetMachine *TM = &MF->getTarget();
+ const TargetInstrInfo *TII = TM->getInstrInfo();
+ const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
- MachineMemOperand *MMO
- = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FrameIdx),
- Align);
+ for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
+ ++OpIdx) {
+ MachineOperand &MO = Instr->getOperand(OpIdx);
+ const TargetRegisterClass *OpRegCstraints =
+ Instr->getRegClassConstraint(OpIdx, TII, TRI);
- unsigned StoreOp = 0;
- if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
- switch(RC->getSize()) {
- case 4: StoreOp = AArch64::LS32_STR; break;
- case 8: StoreOp = AArch64::LS64_STR; break;
- default:
- llvm_unreachable("Unknown size for regclass");
- }
- } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) {
- StoreOp = AArch64::LSFP8_STR;
- } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) {
- StoreOp = AArch64::LSFP16_STR;
- } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
- RC->hasType(MVT::f128)) {
- switch (RC->getSize()) {
- case 4: StoreOp = AArch64::LSFP32_STR; break;
- case 8: StoreOp = AArch64::LSFP64_STR; break;
- case 16: StoreOp = AArch64::LSFP128_STR; break;
- default:
- llvm_unreachable("Unknown size for regclass");
- }
- } else { // For a super register class has more than one sub registers
- if (AArch64::DPairRegClass.hasSubClassEq(RC))
- StoreOp = AArch64::ST1x2_8B;
- else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
- StoreOp = AArch64::ST1x3_8B;
- else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
- StoreOp = AArch64::ST1x4_8B;
- else if (AArch64::QPairRegClass.hasSubClassEq(RC))
- StoreOp = AArch64::ST1x2_16B;
- else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
- StoreOp = AArch64::ST1x3_16B;
- else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
- StoreOp = AArch64::ST1x4_16B;
- else
- llvm_unreachable("Unknown reg class");
+ // If there's no constraint, there's nothing to do.
+ if (!OpRegCstraints)
+ continue;
+ // If the operand is a frame index, there's nothing to do here.
+ // A frame index operand will resolve correctly during PEI.
+ if (MO.isFI())
+ continue;
- MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
- // Vector store has different operands from other store instructions.
- NewMI.addFrameIndex(FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO);
- return;
+ assert(MO.isReg() &&
+ "Operand has register constraints without being a register!");
+
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (!OpRegCstraints->contains(Reg))
+ return false;
+ } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
+ !MRI->constrainRegClass(Reg, OpRegCstraints))
+ return false;
}
- MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
- NewMI.addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FrameIdx)
- .addImm(0)
- .addMemOperand(MMO);
-
+ return true;
}
-void
-AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- DebugLoc DL = MBB.findDebugLoc(MBBI);
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register.
+bool AArch64InstrInfo::optimizeCompareInstr(
+ MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
- MachineMemOperand *MMO
- = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FrameIdx),
- Align);
-
- unsigned LoadOp = 0;
- if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
- switch(RC->getSize()) {
- case 4: LoadOp = AArch64::LS32_LDR; break;
- case 8: LoadOp = AArch64::LS64_LDR; break;
+ // Replace SUBSWrr with SUBWrr if NZCV is not used.
+ int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
+ if (Cmp_NZCV != -1) {
+ unsigned NewOpc;
+ switch (CmpInstr->getOpcode()) {
default:
- llvm_unreachable("Unknown size for regclass");
+ return false;
+ case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break;
+ case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break;
+ case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break;
+ case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break;
+ case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break;
+ case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break;
+ case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break;
+ case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break;
+ case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break;
+ case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break;
+ case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break;
+ case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break;
+ case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break;
+ case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break;
+ case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break;
+ case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break;
}
- } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) {
- LoadOp = AArch64::LSFP8_LDR;
- } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) {
- LoadOp = AArch64::LSFP16_LDR;
- } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
- RC->hasType(MVT::f128)) {
- switch (RC->getSize()) {
- case 4: LoadOp = AArch64::LSFP32_LDR; break;
- case 8: LoadOp = AArch64::LSFP64_LDR; break;
- case 16: LoadOp = AArch64::LSFP128_LDR; break;
- default:
- llvm_unreachable("Unknown size for regclass");
- }
- } else { // For a super register class has more than one sub registers
- if (AArch64::DPairRegClass.hasSubClassEq(RC))
- LoadOp = AArch64::LD1x2_8B;
- else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
- LoadOp = AArch64::LD1x3_8B;
- else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
- LoadOp = AArch64::LD1x4_8B;
- else if (AArch64::QPairRegClass.hasSubClassEq(RC))
- LoadOp = AArch64::LD1x2_16B;
- else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
- LoadOp = AArch64::LD1x3_16B;
- else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
- LoadOp = AArch64::LD1x4_16B;
- else
- llvm_unreachable("Unknown reg class");
- MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
- // Vector load has different operands from other load instructions.
- NewMI.addFrameIndex(FrameIdx)
- .addMemOperand(MMO);
- return;
+ const MCInstrDesc &MCID = get(NewOpc);
+ CmpInstr->setDesc(MCID);
+ CmpInstr->RemoveOperand(Cmp_NZCV);
+ bool succeeded = UpdateOperandRegClass(CmpInstr);
+ (void)succeeded;
+ assert(succeeded && "Some operands reg class are incompatible!");
+ return true;
}
- MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
- NewMI.addFrameIndex(FrameIdx)
- .addImm(0)
- .addMemOperand(MMO);
-}
+ // Continue only if we have a "ri" where immediate is zero.
+ if (CmpValue != 0 || SrcReg2 != 0)
+ return false;
-unsigned AArch64InstrInfo::estimateRSStackLimit(MachineFunction &MF) const {
- unsigned Limit = (1 << 16) - 1;
- for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
- for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
- I != E; ++I) {
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- if (!I->getOperand(i).isFI()) continue;
+ // CmpInstr is a Compare instruction if destination register is not used.
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+ return false;
- // When using ADDxxi_lsl0_s to get the address of a stack object, 0xfff
- // is the largest offset guaranteed to fit in the immediate offset.
- if (I->getOpcode() == AArch64::ADDxxi_lsl0_s) {
- Limit = std::min(Limit, 0xfffu);
- break;
- }
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
- int AccessScale, MinOffset, MaxOffset;
- getAddressConstraints(*I, AccessScale, MinOffset, MaxOffset);
- Limit = std::min(Limit, static_cast<unsigned>(MaxOffset));
+ // We iterate backward, starting from the instruction before CmpInstr and
+ // stop when reaching the definition of the source register or done with the
+ // basic block, to check whether NZCV is used or modified in between.
+ MachineBasicBlock::iterator I = CmpInstr, E = MI,
+ B = CmpInstr->getParent()->begin();
- break; // At most one FI per instruction
+ // Early exit if CmpInstr is at the beginning of the BB.
+ if (I == B)
+ return false;
+
+ // Check whether the definition of SrcReg is in the same basic block as
+ // Compare. If not, we can't optimize away the Compare.
+ if (MI->getParent() != CmpInstr->getParent())
+ return false;
+
+ // Check that NZCV isn't set between the comparison instruction and the one we
+ // want to change.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ for (--I; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
+ Instr.readsRegister(AArch64::NZCV, TRI))
+ // This instruction modifies or uses NZCV after the one we want to
+ // change. We can't do this transformation.
+ return false;
+ if (I == B)
+ // The 'and' is below the comparison instruction.
+ return false;
+ }
+
+ unsigned NewOpc = MI->getOpcode();
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXri:
+ break;
+ case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break;
+ case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break;
+ case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break;
+ case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break;
+ case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break;
+ case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break;
+ case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break;
+ case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break;
+ case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break;
+ case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break;
+ case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break;
+ case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break;
+ case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break;
+ case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break;
+ }
+
+ // Scan forward for the use of NZCV.
+ // When checking against MI: if it's a conditional code requires
+ // checking of V bit, then this is not safe to do.
+ // It is safe to remove CmpInstr if NZCV is redefined or killed.
+ // If we are done with the basic block, we need to check whether NZCV is
+ // live-out.
+ bool IsSafe = false;
+ for (MachineBasicBlock::iterator I = CmpInstr,
+ E = CmpInstr->getParent()->end();
+ !IsSafe && ++I != E;) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
+ ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) {
+ IsSafe = true;
+ break;
}
- }
- }
+ if (!MO.isReg() || MO.getReg() != AArch64::NZCV)
+ continue;
+ if (MO.isDef()) {
+ IsSafe = true;
+ break;
+ }
- return Limit;
-}
-void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
- int &AccessScale, int &MinOffset,
- int &MaxOffset) const {
- switch (MI.getOpcode()) {
- default:
- llvm_unreachable("Unknown load/store kind");
- case TargetOpcode::DBG_VALUE:
- AccessScale = 1;
- MinOffset = INT_MIN;
- MaxOffset = INT_MAX;
- return;
- case AArch64::LS8_LDR: case AArch64::LS8_STR:
- case AArch64::LSFP8_LDR: case AArch64::LSFP8_STR:
- case AArch64::LDRSBw:
- case AArch64::LDRSBx:
- AccessScale = 1;
- MinOffset = 0;
- MaxOffset = 0xfff;
- return;
- case AArch64::LS16_LDR: case AArch64::LS16_STR:
- case AArch64::LSFP16_LDR: case AArch64::LSFP16_STR:
- case AArch64::LDRSHw:
- case AArch64::LDRSHx:
- AccessScale = 2;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LS32_LDR: case AArch64::LS32_STR:
- case AArch64::LSFP32_LDR: case AArch64::LSFP32_STR:
- case AArch64::LDRSWx:
- case AArch64::LDPSWx:
- AccessScale = 4;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LS64_LDR: case AArch64::LS64_STR:
- case AArch64::LSFP64_LDR: case AArch64::LSFP64_STR:
- case AArch64::PRFM:
- AccessScale = 8;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LSFP128_LDR: case AArch64::LSFP128_STR:
- AccessScale = 16;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LSPair32_LDR: case AArch64::LSPair32_STR:
- case AArch64::LSFPPair32_LDR: case AArch64::LSFPPair32_STR:
- AccessScale = 4;
- MinOffset = -0x40 * AccessScale;
- MaxOffset = 0x3f * AccessScale;
- return;
- case AArch64::LSPair64_LDR: case AArch64::LSPair64_STR:
- case AArch64::LSFPPair64_LDR: case AArch64::LSFPPair64_STR:
- AccessScale = 8;
- MinOffset = -0x40 * AccessScale;
- MaxOffset = 0x3f * AccessScale;
- return;
- case AArch64::LSFPPair128_LDR: case AArch64::LSFPPair128_STR:
- AccessScale = 16;
- MinOffset = -0x40 * AccessScale;
- MaxOffset = 0x3f * AccessScale;
- return;
- case AArch64::LD1x2_8B: case AArch64::ST1x2_8B:
- AccessScale = 16;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LD1x3_8B: case AArch64::ST1x3_8B:
- AccessScale = 24;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LD1x4_8B: case AArch64::ST1x4_8B:
- case AArch64::LD1x2_16B: case AArch64::ST1x2_16B:
- AccessScale = 32;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LD1x3_16B: case AArch64::ST1x3_16B:
- AccessScale = 48;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- case AArch64::LD1x4_16B: case AArch64::ST1x4_16B:
- AccessScale = 64;
- MinOffset = 0;
- MaxOffset = 0xfff * AccessScale;
- return;
- }
-}
+ // Decode the condition code.
+ unsigned Opc = Instr.getOpcode();
+ AArch64CC::CondCode CC;
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::Bcc:
+ CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm();
+ break;
+ case AArch64::CSINVWr:
+ case AArch64::CSINVXr:
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr:
+ case AArch64::CSELWr:
+ case AArch64::CSELXr:
+ case AArch64::CSNEGWr:
+ case AArch64::CSNEGXr:
+ case AArch64::FCSELSrrr:
+ case AArch64::FCSELDrrr:
+ CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm();
+ break;
+ }
-unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
- const MCInstrDesc &MCID = MI.getDesc();
- const MachineBasicBlock &MBB = *MI.getParent();
- const MachineFunction &MF = *MBB.getParent();
- const MCAsmInfo &MAI = *MF.getTarget().getMCAsmInfo();
-
- if (MCID.getSize())
- return MCID.getSize();
-
- if (MI.getOpcode() == AArch64::INLINEASM)
- return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
-
- switch (MI.getOpcode()) {
- case TargetOpcode::BUNDLE:
- return getInstBundleLength(MI);
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::CFI_INSTRUCTION:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::GC_LABEL:
- case TargetOpcode::DBG_VALUE:
- case AArch64::TLSDESCCALL:
- return 0;
- default:
- llvm_unreachable("Unknown instruction class");
- }
-}
-
-unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
- unsigned Size = 0;
- MachineBasicBlock::const_instr_iterator I = MI;
- MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
- while (++I != E && I->isInsideBundle()) {
- assert(!I->isBundle() && "No nested bundle!");
- Size += getInstSizeInBytes(*I);
- }
- return Size;
-}
-
-bool llvm::rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
- const AArch64InstrInfo &TII) {
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
-
- MFI.getObjectOffset(FrameRegIdx);
- llvm_unreachable("Unimplemented rewriteFrameIndex");
-}
-
-void llvm::emitRegUpdate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc dl, const TargetInstrInfo &TII,
- unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
- int64_t NumBytes, MachineInstr::MIFlag MIFlags) {
- if (NumBytes == 0 && DstReg == SrcReg)
- return;
- else if (abs64(NumBytes) & ~0xffffff) {
- // Generically, we have to materialize the offset into a temporary register
- // and subtract it. There are a couple of ways this could be done, for now
- // we'll use a movz/movk or movn/movk sequence.
- uint64_t Bits = static_cast<uint64_t>(abs64(NumBytes));
- BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVZxii), ScratchReg)
- .addImm(0xffff & Bits).addImm(0)
- .setMIFlags(MIFlags);
-
- Bits >>= 16;
- if (Bits & 0xffff) {
- BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
- .addReg(ScratchReg)
- .addImm(0xffff & Bits).addImm(1)
- .setMIFlags(MIFlags);
- }
-
- Bits >>= 16;
- if (Bits & 0xffff) {
- BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
- .addReg(ScratchReg)
- .addImm(0xffff & Bits).addImm(2)
- .setMIFlags(MIFlags);
- }
-
- Bits >>= 16;
- if (Bits & 0xffff) {
- BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
- .addReg(ScratchReg)
- .addImm(0xffff & Bits).addImm(3)
- .setMIFlags(MIFlags);
- }
-
- // ADD DST, SRC, xTMP (, lsl #0)
- unsigned AddOp = NumBytes > 0 ? AArch64::ADDxxx_uxtx : AArch64::SUBxxx_uxtx;
- BuildMI(MBB, MBBI, dl, TII.get(AddOp), DstReg)
- .addReg(SrcReg, RegState::Kill)
- .addReg(ScratchReg, RegState::Kill)
- .addImm(0)
- .setMIFlag(MIFlags);
- return;
- }
-
- // Now we know that the adjustment can be done in at most two add/sub
- // (immediate) instructions, which is always more efficient than a
- // literal-pool load, or even a hypothetical movz/movk/add sequence
-
- // Decide whether we're doing addition or subtraction
- unsigned LowOp, HighOp;
- if (NumBytes >= 0) {
- LowOp = AArch64::ADDxxi_lsl0_s;
- HighOp = AArch64::ADDxxi_lsl12_s;
- } else {
- LowOp = AArch64::SUBxxi_lsl0_s;
- HighOp = AArch64::SUBxxi_lsl12_s;
- NumBytes = abs64(NumBytes);
- }
-
- // If we're here, at the very least a move needs to be produced, which just
- // happens to be materializable by an ADD.
- if ((NumBytes & 0xfff) || NumBytes == 0) {
- BuildMI(MBB, MBBI, dl, TII.get(LowOp), DstReg)
- .addReg(SrcReg, RegState::Kill)
- .addImm(NumBytes & 0xfff)
- .setMIFlag(MIFlags);
-
- // Next update should use the register we've just defined.
- SrcReg = DstReg;
- }
-
- if (NumBytes & 0xfff000) {
- BuildMI(MBB, MBBI, dl, TII.get(HighOp), DstReg)
- .addReg(SrcReg, RegState::Kill)
- .addImm(NumBytes >> 12)
- .setMIFlag(MIFlags);
- }
-}
-
-void llvm::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc dl, const TargetInstrInfo &TII,
- unsigned ScratchReg, int64_t NumBytes,
- MachineInstr::MIFlag MIFlags) {
- emitRegUpdate(MBB, MI, dl, TII, AArch64::XSP, AArch64::XSP, AArch64::X16,
- NumBytes, MIFlags);
-}
-
-
-namespace {
- struct LDTLSCleanup : public MachineFunctionPass {
- static char ID;
- LDTLSCleanup() : MachineFunctionPass(ID) {}
-
- virtual bool runOnMachineFunction(MachineFunction &MF) {
- AArch64MachineFunctionInfo* MFI
- = MF.getInfo<AArch64MachineFunctionInfo>();
- if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
- // No point folding accesses if there isn't at least two.
+ // It is not safe to remove Compare instruction if Overflow(V) is used.
+ switch (CC) {
+ default:
+ // NZCV can be used multiple times, we should continue.
+ break;
+ case AArch64CC::VS:
+ case AArch64CC::VC:
+ case AArch64CC::GE:
+ case AArch64CC::LT:
+ case AArch64CC::GT:
+ case AArch64CC::LE:
return false;
}
-
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
- return VisitNode(DT->getRootNode(), 0);
}
+ }
- // Visit the dominator subtree rooted at Node in pre-order.
- // If TLSBaseAddrReg is non-null, then use that to replace any
- // TLS_base_addr instructions. Otherwise, create the register
- // when the first such instruction is seen, and then use it
- // as we encounter more instructions.
- bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
- MachineBasicBlock *BB = Node->getBlock();
- bool Changed = false;
+ // If NZCV is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!IsSafe) {
+ MachineBasicBlock *ParentBlock = CmpInstr->getParent();
+ for (auto *MBB : ParentBlock->successors())
+ if (MBB->isLiveIn(AArch64::NZCV))
+ return false;
+ }
- // Traverse the current block.
- for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
- ++I) {
- switch (I->getOpcode()) {
- case AArch64::TLSDESC_BLRx:
- // Make sure it's a local dynamic access.
- if (!I->getOperand(1).isSymbol() ||
- strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
- break;
+ // Update the instruction to set NZCV.
+ MI->setDesc(get(NewOpc));
+ CmpInstr->eraseFromParent();
+ bool succeeded = UpdateOperandRegClass(MI);
+ (void)succeeded;
+ assert(succeeded && "Some operands reg class are incompatible!");
+ MI->addRegisterDefined(AArch64::NZCV, TRI);
+ return true;
+}
- if (TLSBaseAddrReg)
- I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
- else
- I = SetRegister(I, &TLSBaseAddrReg);
- Changed = true;
- break;
- default:
- break;
- }
- }
-
- // Visit the children of this block in the dominator tree.
- for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
- I != E; ++I) {
- Changed |= VisitNode(*I, TLSBaseAddrReg);
- }
-
- return Changed;
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs:
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ case AArch64::CRC32Brr:
+ case AArch64::CRC32CBrr:
+ case AArch64::CRC32CHrr:
+ case AArch64::CRC32CWrr:
+ case AArch64::CRC32CXrr:
+ case AArch64::CRC32Hrr:
+ case AArch64::CRC32Wrr:
+ case AArch64::CRC32Xrr:
+ case AArch64::EONWrs:
+ case AArch64::EONXrs:
+ case AArch64::EORWrs:
+ case AArch64::EORXrs:
+ case AArch64::ORNWrs:
+ case AArch64::ORNXrs:
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ if (MI->getOperand(3).isImm()) {
+ unsigned val = MI->getOperand(3).getImm();
+ return (val != 0);
}
+ break;
+ }
+ return false;
+}
- // Replace the TLS_base_addr instruction I with a copy from
- // TLSBaseAddrReg, returning the new instruction.
- MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
- unsigned TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
- const AArch64TargetMachine *TM =
- static_cast<const AArch64TargetMachine *>(&MF->getTarget());
- const AArch64InstrInfo *TII = TM->getInstrInfo();
-
- // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
- // code sequence assumes the address will be.
- MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- AArch64::X0)
- .addReg(TLSBaseAddrReg);
-
- // Erase the TLS_base_addr instruction.
- I->eraseFromParent();
-
- return Copy;
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::ADDSWrx:
+ case AArch64::ADDSXrx:
+ case AArch64::ADDSXrx64:
+ case AArch64::ADDWrx:
+ case AArch64::ADDXrx:
+ case AArch64::ADDXrx64:
+ case AArch64::SUBSWrx:
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64:
+ case AArch64::SUBWrx:
+ case AArch64::SUBXrx:
+ case AArch64::SUBXrx64:
+ if (MI->getOperand(3).isImm()) {
+ unsigned val = MI->getOperand(3).getImm();
+ return (val != 0);
}
+ break;
+ }
- // Create a virtal register in *TLSBaseAddrReg, and populate it by
- // inserting a copy instruction after I. Returns the new instruction.
- MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
- const AArch64TargetMachine *TM =
- static_cast<const AArch64TargetMachine *>(&MF->getTarget());
- const AArch64InstrInfo *TII = TM->getInstrInfo();
+ return false;
+}
- // Create a virtual register for the TLS base address.
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
- *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
-
- // Insert a copy from X0 to TLSBaseAddrReg for later.
- MachineInstr *Next = I->getNextNode();
- MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- *TLSBaseAddrReg)
- .addReg(AArch64::X0);
-
- return Copy;
+// Return true if this instruction simply sets its single destination register
+// to zero. This is equivalent to a register rename of the zero-register.
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::MOVZWi:
+ case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
+ if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
+ assert(MI->getDesc().getNumOperands() == 3 &&
+ MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+ return true;
}
+ break;
+ case AArch64::ANDWri: // and Rd, Rzr, #imm
+ return MI->getOperand(1).getReg() == AArch64::WZR;
+ case AArch64::ANDXri:
+ return MI->getOperand(1).getReg() == AArch64::XZR;
+ case TargetOpcode::COPY:
+ return MI->getOperand(1).getReg() == AArch64::WZR;
+ }
+ return false;
+}
- virtual const char *getPassName() const {
- return "Local Dynamic TLS Access Clean-up";
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::COPY: {
+ // GPR32 copies will by lowered to ORRXrs
+ unsigned DstReg = MI->getOperand(0).getReg();
+ return (AArch64::GPR32RegClass.contains(DstReg) ||
+ AArch64::GPR64RegClass.contains(DstReg));
+ }
+ case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+ if (MI->getOperand(1).getReg() == AArch64::XZR) {
+ assert(MI->getDesc().getNumOperands() == 4 &&
+ MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+ return true;
}
+ case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+ if (MI->getOperand(2).getImm() == 0) {
+ assert(MI->getDesc().getNumOperands() == 4 &&
+ MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+ return true;
+ }
+ }
+ return false;
+}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- MachineFunctionPass::getAnalysisUsage(AU);
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::COPY: {
+ // FPR64 copies will by lowered to ORR.16b
+ unsigned DstReg = MI->getOperand(0).getReg();
+ return (AArch64::FPR64RegClass.contains(DstReg) ||
+ AArch64::FPR128RegClass.contains(DstReg));
+ }
+ case AArch64::ORRv16i8:
+ if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+ assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+ "invalid ORRv16i8 operands");
+ return true;
}
+ }
+ return false;
+}
+
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRBui:
+ case AArch64::LDRHui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+
+ return 0;
+}
+
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::STRWui:
+ case AArch64::STRXui:
+ case AArch64::STRBui:
+ case AArch64::STRHui:
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+/// Return true if this is load/store scales or extends its register offset.
+/// This refers to scaling a dynamic index as opposed to scaled immediates.
+/// MI should be a memory op that allows scaled addressing.
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::LDRBBroW:
+ case AArch64::LDRBroW:
+ case AArch64::LDRDroW:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRHroW:
+ case AArch64::LDRQroW:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSWroW:
+ case AArch64::LDRSroW:
+ case AArch64::LDRWroW:
+ case AArch64::LDRXroW:
+ case AArch64::STRBBroW:
+ case AArch64::STRBroW:
+ case AArch64::STRDroW:
+ case AArch64::STRHHroW:
+ case AArch64::STRHroW:
+ case AArch64::STRQroW:
+ case AArch64::STRSroW:
+ case AArch64::STRWroW:
+ case AArch64::STRXroW:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRBroX:
+ case AArch64::LDRDroX:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRHroX:
+ case AArch64::LDRQroX:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSroX:
+ case AArch64::LDRWroX:
+ case AArch64::LDRXroX:
+ case AArch64::STRBBroX:
+ case AArch64::STRBroX:
+ case AArch64::STRDroX:
+ case AArch64::STRHHroX:
+ case AArch64::STRHroX:
+ case AArch64::STRQroX:
+ case AArch64::STRSroX:
+ case AArch64::STRWroX:
+ case AArch64::STRXroX:
+
+ unsigned Val = MI->getOperand(3).getImm();
+ AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
+ return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
+ }
+ return false;
+}
+
+/// Check all MachineMemOperands for a hint to suppress pairing.
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
+ assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+ "Too many target MO flags");
+ for (auto *MM : MI->memoperands()) {
+ if (MM->getFlags() &
+ (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Set a flag on the first MachineMemOperand to suppress pairing.
+void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
+ if (MI->memoperands_empty())
+ return;
+
+ assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+ "Too many target MO flags");
+ (*MI->memoperands_begin())
+ ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
+}
+
+bool
+AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const {
+ switch (LdSt->getOpcode()) {
+ default:
+ return false;
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+ return false;
+ BaseReg = LdSt->getOperand(1).getReg();
+ MachineFunction &MF = *LdSt->getParent()->getParent();
+ unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
+ Offset = LdSt->getOperand(2).getImm() * Width;
+ return true;
};
}
-char LDTLSCleanup::ID = 0;
-FunctionPass*
-llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+/// Detect opportunities for ldp/stp formation.
+///
+/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
+bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
+ MachineInstr *SecondLdSt,
+ unsigned NumLoads) const {
+ // Only cluster up to a single pair.
+ if (NumLoads > 1)
+ return false;
+ if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+ return false;
+ // getLdStBaseRegImmOfs guarantees that oper 2 isImm.
+ unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
+ // Allow 6 bits of positive range.
+ if (Ofs1 > 64)
+ return false;
+ // The caller should already have ordered First/SecondLdSt by offset.
+ unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
+ return Ofs1 + 1 == Ofs2;
+}
+
+bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
+ MachineInstr *Second) const {
+ // Cyclone can fuse CMN, CMP followed by Bcc.
+
+ // FIXME: B0 can also fuse:
+ // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
+ if (Second->getOpcode() != AArch64::Bcc)
+ return false;
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case AArch64::SUBSWri:
+ case AArch64::ADDSWri:
+ case AArch64::ANDSWri:
+ case AArch64::SUBSXri:
+ case AArch64::ADDSXri:
+ case AArch64::ANDSXri:
+ return true;
+ }
+}
+
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx,
+ uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const {
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
+ .addFrameIndex(FrameIx)
+ .addImm(0)
+ .addImm(Offset)
+ .addMetadata(MDPtr);
+ return &*MIB;
+}
+
+static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+ unsigned Reg, unsigned SubIdx,
+ unsigned State,
+ const TargetRegisterInfo *TRI) {
+ if (!SubIdx)
+ return MIB.addReg(Reg, State);
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+ return MIB.addReg(Reg, State, SubIdx);
+}
+
+static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+ unsigned NumRegs) {
+ // We really want the positive remainder mod 32 here, that happens to be
+ // easily obtainable with a mask.
+ return ((DestReg - SrcReg) & 0x1f) < NumRegs;
+}
+
+void AArch64InstrInfo::copyPhysRegTuple(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
+ llvm::ArrayRef<unsigned> Indices) const {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register copy without NEON");
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+ uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+ unsigned NumRegs = Indices.size();
+
+ int SubReg = 0, End = NumRegs, Incr = 1;
+ if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
+ SubReg = NumRegs - 1;
+ End = -1;
+ Incr = -1;
+ }
+
+ for (; SubReg != End; SubReg += Incr) {
+ const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode));
+ AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
+ AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
+ AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
+ }
+}
+
+void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ if (AArch64::GPR32spRegClass.contains(DestReg) &&
+ (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
+ // If either operand is WSP, expand to ADD #0.
+ if (Subtarget.hasZeroCycleRegMove()) {
+ // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
+ unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+ &AArch64::GPR64spRegClass);
+ unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+ &AArch64::GPR64spRegClass);
+ // This instruction is reading and writing X registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegX, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
+ .addReg(SrcRegX, RegState::Undef)
+ .addImm(0)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ }
+ } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+ BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ if (Subtarget.hasZeroCycleRegMove()) {
+ // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
+ unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+ &AArch64::GPR64spRegClass);
+ unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+ &AArch64::GPR64spRegClass);
+ // This instruction is reading and writing X registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegX, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
+ .addReg(AArch64::XZR)
+ .addReg(SrcRegX, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ // Otherwise, expand to ORR WZR.
+ BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
+ .addReg(AArch64::WZR)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ }
+ return;
+ }
+
+ if (AArch64::GPR64spRegClass.contains(DestReg) &&
+ (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
+ if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
+ // If either operand is SP, expand to ADD #0.
+ BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+ BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ // Otherwise, expand to ORR XZR.
+ BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+ .addReg(AArch64::XZR)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ // Copy a DDDD register quad by copying the individual sub-registers.
+ if (AArch64::DDDDRegClass.contains(DestReg) &&
+ AArch64::DDDDRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+ AArch64::dsub2, AArch64::dsub3 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+ Indices);
+ return;
+ }
+
+ // Copy a DDD register triple by copying the individual sub-registers.
+ if (AArch64::DDDRegClass.contains(DestReg) &&
+ AArch64::DDDRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+ AArch64::dsub2 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+ Indices);
+ return;
+ }
+
+ // Copy a DD register pair by copying the individual sub-registers.
+ if (AArch64::DDRegClass.contains(DestReg) &&
+ AArch64::DDRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+ Indices);
+ return;
+ }
+
+ // Copy a QQQQ register quad by copying the individual sub-registers.
+ if (AArch64::QQQQRegClass.contains(DestReg) &&
+ AArch64::QQQQRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+ Indices);
+ return;
+ }
+
+ // Copy a QQQ register triple by copying the individual sub-registers.
+ if (AArch64::QQQRegClass.contains(DestReg) &&
+ AArch64::QQQRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+ Indices);
+ return;
+ }
+
+ // Copy a QQ register pair by copying the individual sub-registers.
+ if (AArch64::QQRegClass.contains(DestReg) &&
+ AArch64::QQRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+ Indices);
+ return;
+ }
+
+ if (AArch64::FPR128RegClass.contains(DestReg) &&
+ AArch64::FPR128RegClass.contains(SrcReg)) {
+ if(getSubTarget().hasNEON()) {
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::STRQpre))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addReg(AArch64::SP)
+ .addImm(-16);
+ BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(DestReg, RegState::Define)
+ .addReg(AArch64::SP)
+ .addImm(16);
+ }
+ return;
+ }
+
+ if (AArch64::FPR64RegClass.contains(DestReg) &&
+ AArch64::FPR64RegClass.contains(SrcReg)) {
+ if(getSubTarget().hasNEON()) {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ if (AArch64::FPR32RegClass.contains(DestReg) &&
+ AArch64::FPR32RegClass.contains(SrcReg)) {
+ if(getSubTarget().hasNEON()) {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ if (AArch64::FPR16RegClass.contains(DestReg) &&
+ AArch64::FPR16RegClass.contains(SrcReg)) {
+ if(getSubTarget().hasNEON()) {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ if (AArch64::FPR8RegClass.contains(DestReg) &&
+ AArch64::FPR8RegClass.contains(SrcReg)) {
+ if(getSubTarget().hasNEON()) {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR128RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ // Copies between GPR64 and FPR64.
+ if (AArch64::FPR64RegClass.contains(DestReg) &&
+ AArch64::GPR64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (AArch64::GPR64RegClass.contains(DestReg) &&
+ AArch64::FPR64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ // Copies between GPR32 and FPR32.
+ if (AArch64::FPR32RegClass.contains(DestReg) &&
+ AArch64::GPR32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (AArch64::GPR32RegClass.contains(DestReg) &&
+ AArch64::FPR32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (DestReg == AArch64::NZCV) {
+ assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
+ BuildMI(MBB, I, DL, get(AArch64::MSR))
+ .addImm(AArch64SysReg::NZCV)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
+ return;
+ }
+
+ if (SrcReg == AArch64::NZCV) {
+ assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
+ BuildMI(MBB, I, DL, get(AArch64::MRS))
+ .addReg(DestReg)
+ .addImm(AArch64SysReg::NZCV)
+ .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
+ return;
+ }
+
+ llvm_unreachable("unimplemented reg-to-reg copy");
+}
+
+void AArch64InstrInfo::storeRegToStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+
+ MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+ unsigned Opc = 0;
+ bool Offset = true;
+ switch (RC->getSize()) {
+ case 1:
+ if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRBui;
+ break;
+ case 2:
+ if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRHui;
+ break;
+ case 4:
+ if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::STRWui;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+ else
+ assert(SrcReg != AArch64::WSP);
+ } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRSui;
+ break;
+ case 8:
+ if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::STRXui;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+ else
+ assert(SrcReg != AArch64::SP);
+ } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRDui;
+ break;
+ case 16:
+ if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+ Opc = AArch64::STRQui;
+ else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Twov1d, Offset = false;
+ }
+ break;
+ case 24:
+ if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Threev1d, Offset = false;
+ }
+ break;
+ case 32:
+ if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Fourv1d, Offset = false;
+ } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Twov2d, Offset = false;
+ }
+ break;
+ case 48:
+ if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Threev2d, Offset = false;
+ }
+ break;
+ case 64:
+ if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register store without NEON");
+ Opc = AArch64::ST1Fourv2d, Offset = false;
+ }
+ break;
+ }
+ assert(Opc && "Unknown register class");
+
+ const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI);
+
+ if (Offset)
+ MI.addImm(0);
+ MI.addMemOperand(MMO);
+}
+
+void AArch64InstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+ MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+
+ unsigned Opc = 0;
+ bool Offset = true;
+ switch (RC->getSize()) {
+ case 1:
+ if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRBui;
+ break;
+ case 2:
+ if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRHui;
+ break;
+ case 4:
+ if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::LDRWui;
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
+ else
+ assert(DestReg != AArch64::WSP);
+ } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRSui;
+ break;
+ case 8:
+ if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::LDRXui;
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
+ else
+ assert(DestReg != AArch64::SP);
+ } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRDui;
+ break;
+ case 16:
+ if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+ Opc = AArch64::LDRQui;
+ else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Twov1d, Offset = false;
+ }
+ break;
+ case 24:
+ if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Threev1d, Offset = false;
+ }
+ break;
+ case 32:
+ if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Fourv1d, Offset = false;
+ } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Twov2d, Offset = false;
+ }
+ break;
+ case 48:
+ if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Threev2d, Offset = false;
+ }
+ break;
+ case 64:
+ if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+ assert(getSubTarget().hasNEON() &&
+ "Unexpected register load without NEON");
+ Opc = AArch64::LD1Fourv2d, Offset = false;
+ }
+ break;
+ }
+ assert(Opc && "Unknown register class");
+
+ const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+ .addReg(DestReg, getDefRegState(true))
+ .addFrameIndex(FI);
+ if (Offset)
+ MI.addImm(0);
+ MI.addMemOperand(MMO);
+}
+
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg, int Offset,
+ const AArch64InstrInfo *TII,
+ MachineInstr::MIFlag Flag, bool SetNZCV) {
+ if (DestReg == SrcReg && Offset == 0)
+ return;
+
+ bool isSub = Offset < 0;
+ if (isSub)
+ Offset = -Offset;
+
+ // FIXME: If the offset won't fit in 24-bits, compute the offset into a
+ // scratch register. If DestReg is a virtual register, use it as the
+ // scratch register; otherwise, create a new virtual register (to be
+ // replaced by the scavenger at the end of PEI). That case can be optimized
+ // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
+ // register can be loaded with offset%8 and the add/sub can use an extending
+ // instruction with LSL#3.
+ // Currently the function handles any offsets but generates a poor sequence
+ // of code.
+ // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
+
+ unsigned Opc;
+ if (SetNZCV)
+ Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
+ else
+ Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
+ const unsigned MaxEncoding = 0xfff;
+ const unsigned ShiftSize = 12;
+ const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+ while (((unsigned)Offset) >= (1 << ShiftSize)) {
+ unsigned ThisVal;
+ if (((unsigned)Offset) > MaxEncodableValue) {
+ ThisVal = MaxEncodableValue;
+ } else {
+ ThisVal = Offset & MaxEncodableValue;
+ }
+ assert((ThisVal >> ShiftSize) <= MaxEncoding &&
+ "Encoding cannot handle value that big");
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addImm(ThisVal >> ShiftSize)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
+ .setMIFlag(Flag);
+
+ SrcReg = DestReg;
+ Offset -= ThisVal;
+ if (Offset == 0)
+ return;
+ }
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addImm(Offset)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+ .setMIFlag(Flag);
+}
+
+MachineInstr *
+AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // This is a bit of a hack. Consider this instruction:
+ //
+ // %vreg0<def> = COPY %SP; GPR64all:%vreg0
+ //
+ // We explicitly chose GPR64all for the virtual register so such a copy might
+ // be eliminated by RegisterCoalescer. However, that may not be possible, and
+ // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
+ // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
+ //
+ // To prevent that, we are going to constrain the %vreg0 register class here.
+ //
+ // <rdar://problem/11522048>
+ //
+ if (MI->isCopy()) {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (SrcReg == AArch64::SP &&
+ TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
+ return nullptr;
+ }
+ if (DstReg == AArch64::SP &&
+ TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+ return nullptr;
+ }
+ }
+
+ // Cannot fold.
+ return nullptr;
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+ bool *OutUseUnscaledOp,
+ unsigned *OutUnscaledOp,
+ int *EmittableOffset) {
+ int Scale = 1;
+ bool IsSigned = false;
+ // The ImmIdx should be changed case by case if it is not 2.
+ unsigned ImmIdx = 2;
+ unsigned UnscaledOp = 0;
+ // Set output values in case of early exit.
+ if (EmittableOffset)
+ *EmittableOffset = 0;
+ if (OutUseUnscaledOp)
+ *OutUseUnscaledOp = false;
+ if (OutUnscaledOp)
+ *OutUnscaledOp = 0;
+ switch (MI.getOpcode()) {
+ default:
+ assert(0 && "unhandled opcode in rewriteAArch64FrameIndex");
+ // Vector spills/fills can't take an immediate offset.
+ case AArch64::LD1Twov2d:
+ case AArch64::LD1Threev2d:
+ case AArch64::LD1Fourv2d:
+ case AArch64::LD1Twov1d:
+ case AArch64::LD1Threev1d:
+ case AArch64::LD1Fourv1d:
+ case AArch64::ST1Twov2d:
+ case AArch64::ST1Threev2d:
+ case AArch64::ST1Fourv2d:
+ case AArch64::ST1Twov1d:
+ case AArch64::ST1Threev1d:
+ case AArch64::ST1Fourv1d:
+ return AArch64FrameOffsetCannotUpdate;
+ case AArch64::PRFMui:
+ Scale = 8;
+ UnscaledOp = AArch64::PRFUMi;
+ break;
+ case AArch64::LDRXui:
+ Scale = 8;
+ UnscaledOp = AArch64::LDURXi;
+ break;
+ case AArch64::LDRWui:
+ Scale = 4;
+ UnscaledOp = AArch64::LDURWi;
+ break;
+ case AArch64::LDRBui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURBi;
+ break;
+ case AArch64::LDRHui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURHi;
+ break;
+ case AArch64::LDRSui:
+ Scale = 4;
+ UnscaledOp = AArch64::LDURSi;
+ break;
+ case AArch64::LDRDui:
+ Scale = 8;
+ UnscaledOp = AArch64::LDURDi;
+ break;
+ case AArch64::LDRQui:
+ Scale = 16;
+ UnscaledOp = AArch64::LDURQi;
+ break;
+ case AArch64::LDRBBui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURBBi;
+ break;
+ case AArch64::LDRHHui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURHHi;
+ break;
+ case AArch64::LDRSBXui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURSBXi;
+ break;
+ case AArch64::LDRSBWui:
+ Scale = 1;
+ UnscaledOp = AArch64::LDURSBWi;
+ break;
+ case AArch64::LDRSHXui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURSHXi;
+ break;
+ case AArch64::LDRSHWui:
+ Scale = 2;
+ UnscaledOp = AArch64::LDURSHWi;
+ break;
+ case AArch64::LDRSWui:
+ Scale = 4;
+ UnscaledOp = AArch64::LDURSWi;
+ break;
+
+ case AArch64::STRXui:
+ Scale = 8;
+ UnscaledOp = AArch64::STURXi;
+ break;
+ case AArch64::STRWui:
+ Scale = 4;
+ UnscaledOp = AArch64::STURWi;
+ break;
+ case AArch64::STRBui:
+ Scale = 1;
+ UnscaledOp = AArch64::STURBi;
+ break;
+ case AArch64::STRHui:
+ Scale = 2;
+ UnscaledOp = AArch64::STURHi;
+ break;
+ case AArch64::STRSui:
+ Scale = 4;
+ UnscaledOp = AArch64::STURSi;
+ break;
+ case AArch64::STRDui:
+ Scale = 8;
+ UnscaledOp = AArch64::STURDi;
+ break;
+ case AArch64::STRQui:
+ Scale = 16;
+ UnscaledOp = AArch64::STURQi;
+ break;
+ case AArch64::STRBBui:
+ Scale = 1;
+ UnscaledOp = AArch64::STURBBi;
+ break;
+ case AArch64::STRHHui:
+ Scale = 2;
+ UnscaledOp = AArch64::STURHHi;
+ break;
+
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ case AArch64::STPXi:
+ case AArch64::STPDi:
+ IsSigned = true;
+ Scale = 8;
+ break;
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
+ IsSigned = true;
+ Scale = 16;
+ break;
+ case AArch64::LDPWi:
+ case AArch64::LDPSi:
+ case AArch64::STPWi:
+ case AArch64::STPSi:
+ IsSigned = true;
+ Scale = 4;
+ break;
+
+ case AArch64::LDURXi:
+ case AArch64::LDURWi:
+ case AArch64::LDURBi:
+ case AArch64::LDURHi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSWi:
+ case AArch64::STURXi:
+ case AArch64::STURWi:
+ case AArch64::STURBi:
+ case AArch64::STURHi:
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURBBi:
+ case AArch64::STURHHi:
+ Scale = 1;
+ break;
+ }
+
+ Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+
+ bool useUnscaledOp = false;
+ // If the offset doesn't match the scale, we rewrite the instruction to
+ // use the unscaled instruction instead. Likewise, if we have a negative
+ // offset (and have an unscaled op to use).
+ if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
+ useUnscaledOp = true;
+
+ // Use an unscaled addressing mode if the instruction has a negative offset
+ // (or if the instruction is already using an unscaled addressing mode).
+ unsigned MaskBits;
+ if (IsSigned) {
+ // ldp/stp instructions.
+ MaskBits = 7;
+ Offset /= Scale;
+ } else if (UnscaledOp == 0 || useUnscaledOp) {
+ MaskBits = 9;
+ IsSigned = true;
+ Scale = 1;
+ } else {
+ MaskBits = 12;
+ IsSigned = false;
+ Offset /= Scale;
+ }
+
+ // Attempt to fold address computation.
+ int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
+ int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
+ if (Offset >= MinOff && Offset <= MaxOff) {
+ if (EmittableOffset)
+ *EmittableOffset = Offset;
+ Offset = 0;
+ } else {
+ int NewOff = Offset < 0 ? MinOff : MaxOff;
+ if (EmittableOffset)
+ *EmittableOffset = NewOff;
+ Offset = (Offset - NewOff) * Scale;
+ }
+ if (OutUseUnscaledOp)
+ *OutUseUnscaledOp = useUnscaledOp;
+ if (OutUnscaledOp)
+ *OutUnscaledOp = UnscaledOp;
+ return AArch64FrameOffsetCanUpdate |
+ (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+}
+
+bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const AArch64InstrInfo *TII) {
+ unsigned Opcode = MI.getOpcode();
+ unsigned ImmIdx = FrameRegIdx + 1;
+
+ if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
+ Offset += MI.getOperand(ImmIdx).getImm();
+ emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
+ MI.getOperand(0).getReg(), FrameReg, Offset, TII,
+ MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
+ MI.eraseFromParent();
+ Offset = 0;
+ return true;
+ }
+
+ int NewOffset;
+ unsigned UnscaledOp;
+ bool UseUnscaledOp;
+ int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
+ &UnscaledOp, &NewOffset);
+ if (Status & AArch64FrameOffsetCanUpdate) {
+ if (Status & AArch64FrameOffsetIsLegal)
+ // Replace the FrameIndex with FrameReg.
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (UseUnscaledOp)
+ MI.setDesc(TII->get(UnscaledOp));
+
+ MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
+ return Offset == 0;
+ }
+
+ return false;
+}
+
+void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(AArch64::HINT);
+ NopInst.addOperand(MCOperand::CreateImm(0));
+}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index ad20f9c..90ce75f 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -11,9 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AARCH64INSTRINFO_H
-#define LLVM_TARGET_AARCH64INSTRINFO_H
+#ifndef LLVM_TARGET_AArch64INSTRINFO_H
+#define LLVM_TARGET_AArch64INSTRINFO_H
+#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -23,89 +24,208 @@
namespace llvm {
class AArch64Subtarget;
+class AArch64TargetMachine;
class AArch64InstrInfo : public AArch64GenInstrInfo {
+ // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
+ // They will be shifted into MOTargetHintStart when accessed.
+ enum TargetMemOperandFlags {
+ MOSuppressPair = 1
+ };
+
const AArch64RegisterInfo RI;
const AArch64Subtarget &Subtarget;
+
public:
- explicit AArch64InstrInfo(const AArch64Subtarget &TM);
+ explicit AArch64InstrInfo(const AArch64Subtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
- ///
- const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+ const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
const AArch64Subtarget &getSubTarget() const { return Subtarget; }
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
- void CopyPhysRegTuple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg) const;
+ unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+ bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DstReg, unsigned &SubIdx) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+
+ /// Returns true if there is a shiftable register and that the shift value
+ /// is non-zero.
+ bool hasShiftedReg(const MachineInstr *MI) const;
+
+ /// Returns true if there is an extendable register and that the extending
+ /// value is non-zero.
+ bool hasExtendedReg(const MachineInstr *MI) const;
+
+ /// \brief Does this instruction set its full destination register to zero?
+ bool isGPRZero(const MachineInstr *MI) const;
+
+ /// \brief Does this instruction rename a GPR without modifying bits?
+ bool isGPRCopy(const MachineInstr *MI) const;
+
+ /// \brief Does this instruction rename an FPR without modifying bits?
+ bool isFPRCopy(const MachineInstr *MI) const;
+
+ /// Return true if this is load/store scales or extends its register offset.
+ /// This refers to scaling a dynamic index as opposed to scaled immediates.
+ /// MI should be a memory op that allows scaled addressing.
+ bool isScaledAddr(const MachineInstr *MI) const;
+
+ /// Return true if pairing the given load or store is hinted to be
+ /// unprofitable.
+ bool isLdStPairSuppressed(const MachineInstr *MI) const;
+
+ /// Hint that pairing the given load or store is unprofitable.
+ void suppressLdStPair(MachineInstr *MI) const;
+
+ bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool enableClusterLoads() const override { return true; }
+
+ bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleAdjacent(MachineInstr *First,
+ MachineInstr *Second) const override;
+
+ MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+ uint64_t Offset, const MDNode *MDPtr,
+ DebugLoc DL) const;
+ void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc, unsigned Opcode,
+ llvm::ArrayRef<unsigned> Indices) const;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
+
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const override;
bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify = false) const;
+ bool AllowModify = false) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const;
- unsigned RemoveBranch(MachineBasicBlock &MBB) const;
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+ DebugLoc DL) const override;
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ bool canInsertSelect(const MachineBasicBlock &,
+ const SmallVectorImpl<MachineOperand> &Cond, unsigned,
+ unsigned, int &, int &, int &) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ DebugLoc DL, unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const override;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+ /// Return true if the comparison instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+ /// optimizeCompareInstr - Convert the instruction supplying the argument to
+ /// the comparison into one that sets the zero bit in the flags register.
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
- /// Look through the instructions in this function and work out the largest
- /// the stack frame can be while maintaining the ability to address local
- /// slots with no complexities.
- unsigned estimateRSStackLimit(MachineFunction &MF) const;
-
- /// getAddressConstraints - For loads and stores (and PRFMs) taking an
- /// immediate offset, this function determines the constraints required for
- /// the immediate. It must satisfy:
- /// + MinOffset <= imm <= MaxOffset
- /// + imm % OffsetScale == 0
- void getAddressConstraints(const MachineInstr &MI, int &AccessScale,
- int &MinOffset, int &MaxOffset) const;
-
-
- unsigned getInstSizeInBytes(const MachineInstr &MI) const;
-
- unsigned getInstBundleLength(const MachineInstr &MI) const;
-
+private:
+ void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
+ MachineBasicBlock *TBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
};
-bool rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
- const AArch64InstrInfo &TII);
+/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
+/// plus Offset. This is intended to be used from within the prolog/epilog
+/// insertion (PEI) pass, where a virtual scratch register may be allocated
+/// if necessary, to be replaced by the scavenger at the end of PEI.
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
+ const AArch64InstrInfo *TII,
+ MachineInstr::MIFlag = MachineInstr::NoFlags,
+ bool SetNZCV = false);
+/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
+/// FP. Return false if the offset could not be handled directly in MI, and
+/// return the left-over portion by reference.
+bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const AArch64InstrInfo *TII);
-void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc dl, const TargetInstrInfo &TII,
- unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
- int64_t NumBytes,
- MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal.
+enum AArch64FrameOffsetStatus {
+ AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
+ AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal.
+ AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly.
+};
-void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc dl, const TargetInstrInfo &TII,
- unsigned ScratchReg, int64_t NumBytes,
- MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// The returned value reports the validity of the frame offset for @p MI.
+/// It uses the values defined by AArch64FrameOffsetStatus for that.
+/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
+/// use an offset.eq
+/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
+/// rewriten in @p MI.
+/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
+/// amount that is off the limit of the legal offset.
+/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be
+/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp.
+/// If set, @p EmittableOffset contains the amount that can be set in @p MI
+/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
+/// is a legal offset.
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+ bool *OutUseUnscaledOp = nullptr,
+ unsigned *OutUnscaledOp = nullptr,
+ int *EmittableOffset = nullptr);
+static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
+
+static inline bool isCondBranchOpcode(int Opc) {
+ switch (Opc) {
+ case AArch64::Bcc:
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ return true;
+ default:
+ return false;
+ }
}
+static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; }
+
+} // end namespace llvm
+
#endif
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 7d7a641..9ad36e8 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1,4 +1,4 @@
-//===----- AArch64InstrInfo.td - AArch64 Instruction Info ----*- tablegen -*-=//
+//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file describes the AArch64 scalar instructions in TableGen format.
+// AArch64 Instruction definitions.
//
//===----------------------------------------------------------------------===//
@@ -19,5368 +19,5266 @@
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<"FeatureCrypto","crypto">;
+ AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC", "crc">;
+def IsLE : Predicate<"Subtarget->isLittleEndian()">;
+def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-// Use fused MAC if more precision in FP computation is allowed.
-def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
- " FPOpFusion::Fast)">;
+//===----------------------------------------------------------------------===//
+// AArch64-specific DAG Nodes.
+//
+
+// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
+def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<0>,
+ SDTCisVT<3, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+
+def SDT_AArch64Brcond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
+def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, OtherVT>]>;
+
+
+def SDT_AArch64CSel : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<3>,
+ SDTCisVT<4, i32>]>;
+def SDT_AArch64FCmp : SDTypeProfile<0, 2,
+ [SDTCisFP<0>,
+ SDTCisSameAs<0, 1>]>;
+def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>;
+def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisInt<2>, SDTCisInt<3>]>;
+def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+
+def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
+def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
+def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>;
+def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
+
+def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
+
+def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>]>;
+def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
+ SDTCisSameAs<1, 4>]>;
+
+
+// Node definitions.
+def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
+def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
+def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ [SDNPHasChain, SDNPOutGlue]>;
+def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64call : SDNode<"AArch64ISD::CALL",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
+ [SDNPHasChain]>;
+def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
+ [SDNPHasChain]>;
+def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
+ [SDNPHasChain]>;
+def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
+ [SDNPHasChain]>;
+def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
+ [SDNPHasChain]>;
+
+
+def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
+def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
+def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
+def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
+def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >;
+def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>;
+def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut,
+ [SDNPCommutative]>;
+def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>;
+def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
+ [SDNPCommutative]>;
+def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
+def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;
+
+def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+
+def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>;
+def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>;
+
+def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
+def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
+def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
+def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
+def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+
+def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
+def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
+def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
+def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
+def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
+def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
+
+def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
+def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
+def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
+def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
+def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
+def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
+def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
+
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
+
+def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
+def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
+def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
+def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
+def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
+def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
+def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+
+def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
+def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
+def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+
+def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
+def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
+def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
+def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
+def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;
+
+def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
+def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
+def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;
+
+def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
+def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
+def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
+def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
+def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
+def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
+ (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
+
+def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
+def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
+def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
+def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
+def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
+
+def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
+def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
+
+def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
+
+def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
+def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
+
+def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL",
+ SDT_AArch64TLSDescCall,
+ [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+ SDNPVariadic]>;
+
+def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
+ SDT_AArch64WrapperLarge>;
+
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// AArch64 Instruction Predicate Definitions.
+//
+def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
+def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
+def ForCodeSize : Predicate<"ForCodeSize">;
+def NotForCodeSize : Predicate<"!ForCodeSize">;
+
include "AArch64InstrFormats.td"
//===----------------------------------------------------------------------===//
-// AArch64 specific pattern fragments.
-//
-// An 'fmul' node with a single use.
-def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
- return N->hasOneUse();
-}]>;
-
//===----------------------------------------------------------------------===//
-// Target-specific ISD nodes and profiles
+// Miscellaneous instructions.
//===----------------------------------------------------------------------===//
-def SDT_A64ret : SDTypeProfile<0, 0, []>;
-def A64ret : SDNode<"AArch64ISD::Ret", SDT_A64ret, [SDNPHasChain,
- SDNPOptInGlue,
- SDNPVariadic]>;
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ [(AArch64callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
-// (ins NZCV, Condition, Dest)
-def SDT_A64br_cc : SDTypeProfile<0, 3, [SDTCisVT<0, i32>]>;
-def A64br_cc : SDNode<"AArch64ISD::BR_CC", SDT_A64br_cc, [SDNPHasChain]>;
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions. When that changes, they can be
+// removed, along with the AArch64Wrapper node.
-// (outs Result), (ins NZCV, IfTrue, IfFalse, Condition)
-def SDT_A64select_cc : SDTypeProfile<1, 4, [SDTCisVT<1, i32>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<2, 3>]>;
-def A64select_cc : SDNode<"AArch64ISD::SELECT_CC", SDT_A64select_cc>;
+let AddedComplexity = 10 in
+def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
+ [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
+ Sched<[WriteLDAdr]>;
-// (outs NZCV), (ins LHS, RHS, Condition)
-def SDT_A64setcc : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
- SDTCisSameAs<1, 2>]>;
-def A64setcc : SDNode<"AArch64ISD::SETCC", SDT_A64setcc>;
+// The MOVaddr instruction should match only when the add is not folded
+// into a load or store address.
+def MOVaddr
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
+ tglobaladdr:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrJT
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
+ tjumptable:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrCP
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
+ tconstpool:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrBA
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
+ tblockaddress:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrTLS
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
+ tglobaltlsaddr:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrEXT
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
+ texternalsym:$low))]>,
+ Sched<[WriteAdrAdr]>;
+} // isReMaterializable, isCodeGenOnly
-// (outs GPR64), (ins)
-def A64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
+ (LOADgot tglobaltlsaddr:$addr)>;
-// A64 compares don't care about the cond really (they set all flags) so a
-// simple binary operator is useful.
-def A64cmp : PatFrag<(ops node:$lhs, node:$rhs),
- (A64setcc node:$lhs, node:$rhs, cond)>;
+def : Pat<(AArch64LOADgot texternalsym:$addr),
+ (LOADgot texternalsym:$addr)>;
-
-// When matching a notional (CMP op1, (sub 0, op2)), we'd like to use a CMN
-// instruction on the grounds that "op1 - (-op2) == op1 + op2". However, the C
-// and V flags can be set differently by this operation. It comes down to
-// whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are
-// then everything is fine. If not then the optimization is wrong. Thus general
-// comparisons are only valid if op2 != 0.
-
-// So, finally, the only LLVM-native comparisons that don't mention C and V are
-// SETEQ and SETNE. They're the only ones we can safely use CMN for in the
-// absence of information about op2.
-def equality_cond : PatLeaf<(cond), [{
- return N->get() == ISD::SETEQ || N->get() == ISD::SETNE;
-}]>;
-
-def A64cmn : PatFrag<(ops node:$lhs, node:$rhs),
- (A64setcc node:$lhs, (sub 0, node:$rhs), equality_cond)>;
-
-// There are two layers of indirection here, driven by the following
-// considerations.
-// + TableGen does not know CodeModel or Reloc so that decision should be
-// made for a variable/address at ISelLowering.
-// + The output of ISelLowering should be selectable (hence the Wrapper,
-// rather than a bare target opcode)
-def SDTAArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>,
- SDTCisSameAs<0, 4>,
- SDTCisPtrTy<0>]>;
-
-def A64WrapperLarge :SDNode<"AArch64ISD::WrapperLarge", SDTAArch64WrapperLarge>;
-
-def SDTAArch64WrapperSmall : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
- SDTCisSameAs<1, 2>,
- SDTCisVT<3, i32>,
- SDTCisPtrTy<0>]>;
-
-def A64WrapperSmall :SDNode<"AArch64ISD::WrapperSmall", SDTAArch64WrapperSmall>;
-
-
-def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
-def A64GOTLoad : SDNode<"AArch64ISD::GOTLoad", SDTAArch64GOTLoad,
- [SDNPHasChain]>;
-
-
-// (A64BFI LHS, RHS, LSB, Width)
-def SDTA64BFI : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
- SDTCisSameAs<1, 2>,
- SDTCisVT<3, i64>,
- SDTCisVT<4, i64>]>;
-
-def A64Bfi : SDNode<"AArch64ISD::BFI", SDTA64BFI>;
-
-// (A64EXTR HiReg, LoReg, LSB)
-def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
- SDTCisVT<3, i64>]>;
-def A64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
-
-// (A64[SU]BFX Field, ImmR, ImmS).
-//
-// Note that ImmR and ImmS are already encoded for the actual instructions. The
-// more natural LSB and Width mix together to form ImmR and ImmS, something
-// which TableGen can't handle.
-def SDTA64BFX : SDTypeProfile<1, 3, [SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
-def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
-
-def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
-
-class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+def : Pat<(AArch64LOADgot tconstpool:$addr),
+ (LOADgot tconstpool:$addr)>;
//===----------------------------------------------------------------------===//
-// Call sequence pseudo-instructions
+// System instructions.
//===----------------------------------------------------------------------===//
+def HINT : HintI<"hint">;
+def : InstAlias<"nop", (HINT 0b000)>;
+def : InstAlias<"yield",(HINT 0b001)>;
+def : InstAlias<"wfe", (HINT 0b010)>;
+def : InstAlias<"wfi", (HINT 0b011)>;
+def : InstAlias<"sev", (HINT 0b100)>;
+def : InstAlias<"sevl", (HINT 0b101)>;
-def SDT_AArch64Call : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def AArch64Call : SDNode<"AArch64ISD::Call", SDT_AArch64Call,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+ // As far as LLVM is concerned this writes to the system's exclusive monitors.
+let mayLoad = 1, mayStore = 1 in
+def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
-def AArch64tcret : SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64Call,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def DMB : CRmSystemI<barrier_op, 0b101, "dmb">;
+def DSB : CRmSystemI<barrier_op, 0b100, "dsb">;
+def ISB : CRmSystemI<barrier_op, 0b110, "isb">;
+def : InstAlias<"clrex", (CLREX 0xf)>;
+def : InstAlias<"isb", (ISB 0xf)>;
-// The TLSDESCCALL node is a variant call which goes to an indirectly calculated
-// destination but needs a relocation against a fixed symbol. As such it has two
-// certain operands: the callee and the relocated variable.
-//
-// The TLS ABI only allows it to be selected to a BLR instructin (with
-// appropriate relocation).
-def SDTTLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+def MRS : MRSI;
+def MSR : MSRI;
+def MSRpstate: MSRpstateI;
-def A64tlsdesc_blr : SDNode<"AArch64ISD::TLSDESCCALL", SDTTLSDescCall,
- [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
- SDNPVariadic]>;
+// The thread pointer (on Linux, at least, where this has been implemented) is
+// TPIDR_EL0.
+def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// Generic system instructions
+def SYSxt : SystemXtI<0, "sys">;
+def SYSLxt : SystemLXtI<1, "sysl">;
-def SDT_AArch64CallSeqStart : SDCallSeqStart<[ SDTCisPtrTy<0> ]>;
-def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AArch64CallSeqStart,
- [SDNPHasChain, SDNPOutGlue]>;
-
-def SDT_AArch64CallSeqEnd : SDCallSeqEnd<[ SDTCisPtrTy<0>, SDTCisPtrTy<1> ]>;
-def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AArch64CallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-
-
-
-// These pseudo-instructions have special semantics by virtue of being passed to
-// the InstrInfo constructor. CALLSEQ_START/CALLSEQ_END are produced by
-// LowerCall to (in our case) tell the back-end about stack adjustments for
-// arguments passed on the stack. Here we select those markers to
-// pseudo-instructions which explicitly set the stack, and finally in the
-// RegisterInfo we convert them to a true stack adjustment.
-let Defs = [XSP], Uses = [XSP] in {
- def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i64imm:$amt),
- [(AArch64callseq_start timm:$amt)]>;
-
- def ADJCALLSTACKUP : PseudoInst<(outs), (ins i64imm:$amt1, i64imm:$amt2),
- [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
-}
+def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
+ (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
+ sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
//===----------------------------------------------------------------------===//
-// Atomic operation pseudo-instructions
+// Move immediate instructions.
//===----------------------------------------------------------------------===//
-// These get selected from C++ code as a pretty much direct translation from the
-// generic DAG nodes. The one exception is the AtomicOrdering is added as an
-// operand so that the eventual lowering can make use of it and choose
-// acquire/release operations when required.
+defm MOVK : InsertImmediate<0b11, "movk">;
+defm MOVN : MoveImmediate<0b00, "movn">;
-let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in {
-multiclass AtomicSizes {
- def _I8 : PseudoInst<(outs GPR32:$dst),
- (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
- def _I16 : PseudoInst<(outs GPR32:$dst),
- (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
- def _I32 : PseudoInst<(outs GPR32:$dst),
- (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
- def _I64 : PseudoInst<(outs GPR64:$dst),
- (ins GPR64xsp:$ptr, GPR64:$incr, i32imm:$ordering), []>;
-}
-}
+let PostEncoderMethod = "fixMOVZ" in
+defm MOVZ : MoveImmediate<0b10, "movz">;
-defm ATOMIC_LOAD_ADD : AtomicSizes;
-defm ATOMIC_LOAD_SUB : AtomicSizes;
-defm ATOMIC_LOAD_AND : AtomicSizes;
-defm ATOMIC_LOAD_OR : AtomicSizes;
-defm ATOMIC_LOAD_XOR : AtomicSizes;
-defm ATOMIC_LOAD_NAND : AtomicSizes;
-defm ATOMIC_SWAP : AtomicSizes;
-let Defs = [NZCV] in {
- // These operations need a CMP to calculate the correct value
- defm ATOMIC_LOAD_MIN : AtomicSizes;
- defm ATOMIC_LOAD_MAX : AtomicSizes;
- defm ATOMIC_LOAD_UMIN : AtomicSizes;
- defm ATOMIC_LOAD_UMAX : AtomicSizes;
-}
+// First group of aliases covers an implicit "lsl #0".
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
-class AtomicCmpSwap<RegisterClass GPRData>
- : PseudoInst<(outs GPRData:$dst),
- (ins GPR64xsp:$ptr, GPRData:$old, GPRData:$new,
- i32imm:$ordering), []> {
- let usesCustomInserter = 1;
- let hasCtrlDep = 1;
- let mayLoad = 1;
- let mayStore = 1;
- let Defs = [NZCV];
-}
+// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
-def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<GPR64>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
-//===----------------------------------------------------------------------===//
-// Add-subtract (extended register) instructions
-//===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
-// The RHS of these operations is conceptually a sign/zero-extended
-// register, optionally shifted left by 1-4. The extension can be a
-// NOP (e.g. "sxtx" sign-extending a 64-bit register to 64-bits) but
-// must be specified with one exception:
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
-// If one of the registers is sp/wsp then LSL is an alias for UXTW in
-// 32-bit instructions and UXTX in 64-bit versions, the shift amount
-// is not optional in that case (but can explicitly be 0), and the
-// entire suffix can be skipped (e.g. "add sp, x3, x2").
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
-multiclass extend_operands<string PREFIX, string Diag> {
- def _asmoperand : AsmOperandClass {
- let Name = PREFIX;
- let RenderMethod = "addRegExtendOperands";
- let PredicateMethod = "isRegExtend<A64SE::" # PREFIX # ">";
- let DiagnosticType = "AddSubRegExtend" # Diag;
- }
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
- def _operand : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 4; }]> {
- let PrintMethod = "printRegExtendOperand<A64SE::" # PREFIX # ">";
- let DecoderMethod = "DecodeRegExtendOperand";
- let ParserMatchClass = !cast<AsmOperandClass>(PREFIX # "_asmoperand");
- }
-}
-
-defm UXTB : extend_operands<"UXTB", "Small">;
-defm UXTH : extend_operands<"UXTH", "Small">;
-defm UXTW : extend_operands<"UXTW", "Small">;
-defm UXTX : extend_operands<"UXTX", "Large">;
-defm SXTB : extend_operands<"SXTB", "Small">;
-defm SXTH : extend_operands<"SXTH", "Small">;
-defm SXTW : extend_operands<"SXTW", "Small">;
-defm SXTX : extend_operands<"SXTX", "Large">;
-
-def LSL_extasmoperand : AsmOperandClass {
- let Name = "RegExtendLSL";
- let RenderMethod = "addRegExtendOperands";
- let DiagnosticType = "AddSubRegExtendLarge";
-}
-
-def LSL_extoperand : Operand<i64> {
- let ParserMatchClass = LSL_extasmoperand;
-}
-
-
-// The patterns for various sign-extensions are a little ugly and
-// non-uniform because everything has already been promoted to the
-// legal i64 and i32 types. We'll wrap the various variants up in a
-// class for use later.
-class extend_types {
- dag uxtb; dag uxth; dag uxtw; dag uxtx;
- dag sxtb; dag sxth; dag sxtw; dag sxtx;
- ValueType ty;
- RegisterClass GPR;
-}
-
-def extends_to_i64 : extend_types {
- let uxtb = (and (anyext i32:$Rm), 255);
- let uxth = (and (anyext i32:$Rm), 65535);
- let uxtw = (zext i32:$Rm);
- let uxtx = (i64 $Rm);
-
- let sxtb = (sext_inreg (anyext i32:$Rm), i8);
- let sxth = (sext_inreg (anyext i32:$Rm), i16);
- let sxtw = (sext i32:$Rm);
- let sxtx = (i64 $Rm);
-
- let ty = i64;
- let GPR = GPR64xsp;
-}
-
-
-def extends_to_i32 : extend_types {
- let uxtb = (and i32:$Rm, 255);
- let uxth = (and i32:$Rm, 65535);
- let uxtw = (i32 i32:$Rm);
- let uxtx = (i32 i32:$Rm);
-
- let sxtb = (sext_inreg i32:$Rm, i8);
- let sxth = (sext_inreg i32:$Rm, i16);
- let sxtw = (i32 i32:$Rm);
- let sxtx = (i32 i32:$Rm);
-
- let ty = i32;
- let GPR = GPR32wsp;
-}
-
-// Now, six of the extensions supported are easy and uniform: if the source size
-// is 32-bits or less, then Rm is always a 32-bit register. We'll instantiate
-// those instructions in one block.
-
-// The uxtx/sxtx could potentially be merged in, but three facts dissuaded me:
-// + It would break the naming scheme: either ADDxx_uxtx or ADDww_uxtx would
-// be impossible.
-// + Patterns are very different as well.
-// + Passing different registers would be ugly (more fields in extend_types
-// would probably be the best option).
-multiclass addsub_exts<bit sf, bit op, bit S, string asmop,
- SDPatternOperator opfrag,
- dag outs, extend_types exts> {
- def w_uxtb : A64I_addsubext<sf, op, S, 0b00, 0b000,
- outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTB_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag exts.ty:$Rn, (shl exts.uxtb, UXTB_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- def w_uxth : A64I_addsubext<sf, op, S, 0b00, 0b001,
- outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTH_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag exts.ty:$Rn, (shl exts.uxth, UXTH_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- def w_uxtw : A64I_addsubext<sf, op, S, 0b00, 0b010,
- outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTW_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag exts.ty:$Rn, (shl exts.uxtw, UXTW_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def w_sxtb : A64I_addsubext<sf, op, S, 0b00, 0b100,
- outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTB_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag exts.ty:$Rn, (shl exts.sxtb, SXTB_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- def w_sxth : A64I_addsubext<sf, op, S, 0b00, 0b101,
- outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTH_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag exts.ty:$Rn, (shl exts.sxth, SXTH_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- def w_sxtw : A64I_addsubext<sf, op, S, 0b00, 0b110,
- outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTW_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag exts.ty:$Rn, (shl exts.sxtw, SXTW_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-// These two could be merge in with the above, but their patterns aren't really
-// necessary and the naming-scheme would necessarily break:
-multiclass addsub_xxtx<bit op, bit S, string asmop, SDPatternOperator opfrag,
- dag outs> {
- def x_uxtx : A64I_addsubext<0b1, op, S, 0b00, 0b011,
- outs,
- (ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [(opfrag i64:$Rn, (shl i64:$Rm, UXTX_operand:$Imm3))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111,
- outs,
- (ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [/* No Pattern: same as uxtx */],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-multiclass addsub_wxtx<bit op, bit S, string asmop, dag outs> {
- def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011,
- outs, (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [/* No pattern: probably same as uxtw */],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111,
- outs, (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [/* No Pattern: probably same as uxtw */],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-class SetRD<RegisterClass RC, SDPatternOperator op>
- : PatFrag<(ops node:$lhs, node:$rhs), (set RC:$Rd, (op node:$lhs, node:$rhs))>;
-class SetNZCV<SDPatternOperator op>
- : PatFrag<(ops node:$lhs, node:$rhs), (set NZCV, (op node:$lhs, node:$rhs))>;
-
-defm ADDxx :addsub_exts<0b1, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
- (outs GPR64xsp:$Rd), extends_to_i64>,
- addsub_xxtx< 0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
- (outs GPR64xsp:$Rd)>;
-defm ADDww :addsub_exts<0b0, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR32wsp, add>,
- (outs GPR32wsp:$Rd), extends_to_i32>,
- addsub_wxtx< 0b0, 0b0, "add\t$Rd, ",
- (outs GPR32wsp:$Rd)>;
-defm SUBxx :addsub_exts<0b1, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
- (outs GPR64xsp:$Rd), extends_to_i64>,
- addsub_xxtx< 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
- (outs GPR64xsp:$Rd)>;
-defm SUBww :addsub_exts<0b0, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR32wsp, sub>,
- (outs GPR32wsp:$Rd), extends_to_i32>,
- addsub_wxtx< 0b1, 0b0, "sub\t$Rd, ",
- (outs GPR32wsp:$Rd)>;
-
-let Defs = [NZCV] in {
-defm ADDSxx :addsub_exts<0b1, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>,
- (outs GPR64:$Rd), extends_to_i64>,
- addsub_xxtx< 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>,
- (outs GPR64:$Rd)>;
-defm ADDSww :addsub_exts<0b0, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR32, addc>,
- (outs GPR32:$Rd), extends_to_i32>,
- addsub_wxtx< 0b0, 0b1, "adds\t$Rd, ",
- (outs GPR32:$Rd)>;
-defm SUBSxx :addsub_exts<0b1, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>,
- (outs GPR64:$Rd), extends_to_i64>,
- addsub_xxtx< 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>,
- (outs GPR64:$Rd)>;
-defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR32, subc>,
- (outs GPR32:$Rd), extends_to_i32>,
- addsub_wxtx< 0b1, 0b1, "subs\t$Rd, ",
- (outs GPR32:$Rd)>;
-
-
-let SchedRW = [WriteCMP, ReadCMP, ReadCMP], Rd = 0b11111, isCompare = 1 in {
-defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
- (outs), extends_to_i64>,
- addsub_xxtx< 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, (outs)>;
-defm CMNw : addsub_exts<0b0, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
- (outs), extends_to_i32>,
- addsub_wxtx< 0b0, 0b1, "cmn\t", (outs)>;
-defm CMPx : addsub_exts<0b1, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>,
- (outs), extends_to_i64>,
- addsub_xxtx< 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>, (outs)>;
-defm CMPw : addsub_exts<0b0, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>,
- (outs), extends_to_i32>,
- addsub_wxtx< 0b1, 0b1, "cmp\t", (outs)>;
-}
-}
-
-// Now patterns for the operation without a shift being needed. No patterns are
-// created for uxtx/sxtx since they're non-uniform and it's expected that
-// add/sub (shifted register) will handle those cases anyway.
-multiclass addsubext_noshift_patterns<string prefix, SDPatternOperator nodeop,
- extend_types exts> {
- def : Pat<(nodeop exts.ty:$Rn, exts.uxtb),
- (!cast<Instruction>(prefix # "w_uxtb") $Rn, $Rm, 0)>;
- def : Pat<(nodeop exts.ty:$Rn, exts.uxth),
- (!cast<Instruction>(prefix # "w_uxth") $Rn, $Rm, 0)>;
- def : Pat<(nodeop exts.ty:$Rn, exts.uxtw),
- (!cast<Instruction>(prefix # "w_uxtw") $Rn, $Rm, 0)>;
-
- def : Pat<(nodeop exts.ty:$Rn, exts.sxtb),
- (!cast<Instruction>(prefix # "w_sxtb") $Rn, $Rm, 0)>;
- def : Pat<(nodeop exts.ty:$Rn, exts.sxth),
- (!cast<Instruction>(prefix # "w_sxth") $Rn, $Rm, 0)>;
- def : Pat<(nodeop exts.ty:$Rn, exts.sxtw),
- (!cast<Instruction>(prefix # "w_sxtw") $Rn, $Rm, 0)>;
-}
-
-defm : addsubext_noshift_patterns<"ADDxx", add, extends_to_i64>;
-defm : addsubext_noshift_patterns<"ADDww", add, extends_to_i32>;
-defm : addsubext_noshift_patterns<"SUBxx", sub, extends_to_i64>;
-defm : addsubext_noshift_patterns<"SUBww", sub, extends_to_i32>;
-
-defm : addsubext_noshift_patterns<"CMNx", A64cmn, extends_to_i64>;
-defm : addsubext_noshift_patterns<"CMNw", A64cmn, extends_to_i32>;
-defm : addsubext_noshift_patterns<"CMPx", A64cmp, extends_to_i64>;
-defm : addsubext_noshift_patterns<"CMPw", A64cmp, extends_to_i32>;
-
-// An extend of "lsl #imm" is valid if and only if one of Rn and Rd is
-// sp/wsp. It is synonymous with uxtx/uxtw depending on the size of the
-// operation. Also permitted in this case is complete omission of the argument,
-// which implies "lsl #0".
-multiclass lsl_aliases<string asmop, Instruction inst, RegisterClass GPR_Rd,
- RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
- def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
- (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
-
- def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm, $LSL"),
- (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
-
-}
-
-defm : lsl_aliases<"add", ADDxxx_uxtx, Rxsp, GPR64xsp, GPR64>;
-defm : lsl_aliases<"add", ADDxxx_uxtx, GPR64xsp, Rxsp, GPR64>;
-defm : lsl_aliases<"add", ADDwww_uxtw, Rwsp, GPR32wsp, GPR32>;
-defm : lsl_aliases<"add", ADDwww_uxtw, GPR32wsp, Rwsp, GPR32>;
-defm : lsl_aliases<"sub", SUBxxx_uxtx, Rxsp, GPR64xsp, GPR64>;
-defm : lsl_aliases<"sub", SUBxxx_uxtx, GPR64xsp, Rxsp, GPR64>;
-defm : lsl_aliases<"sub", SUBwww_uxtw, Rwsp, GPR32wsp, GPR32>;
-defm : lsl_aliases<"sub", SUBwww_uxtw, GPR32wsp, Rwsp, GPR32>;
-
-// Rd cannot be sp for flag-setting variants so only half of the aliases are
-// needed.
-defm : lsl_aliases<"adds", ADDSxxx_uxtx, GPR64, Rxsp, GPR64>;
-defm : lsl_aliases<"adds", ADDSwww_uxtw, GPR32, Rwsp, GPR32>;
-defm : lsl_aliases<"subs", SUBSxxx_uxtx, GPR64, Rxsp, GPR64>;
-defm : lsl_aliases<"subs", SUBSwww_uxtw, GPR32, Rwsp, GPR32>;
-
-// CMP unfortunately has to be different because the instruction doesn't have a
-// dest register.
-multiclass cmp_lsl_aliases<string asmop, Instruction inst,
- RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
- def : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
- (inst GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
-
- def : InstAlias<!strconcat(asmop, " $Rn, $Rm, $LSL"),
- (inst GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
-}
-
-defm : cmp_lsl_aliases<"cmp", CMPxx_uxtx, Rxsp, GPR64>;
-defm : cmp_lsl_aliases<"cmp", CMPww_uxtw, Rwsp, GPR32>;
-defm : cmp_lsl_aliases<"cmn", CMNxx_uxtx, Rxsp, GPR64>;
-defm : cmp_lsl_aliases<"cmn", CMNww_uxtw, Rwsp, GPR32>;
-
-//===----------------------------------------------------------------------===//
-// Add-subtract (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, MOV
-
-// These instructions accept a 12-bit unsigned immediate, optionally shifted
-// left by 12 bits. Official assembly format specifies a 12 bit immediate with
-// one of "", "LSL #0", "LSL #12" supplementary operands.
-
-// There are surprisingly few ways to make this work with TableGen, so this
-// implementation has separate instructions for the "LSL #0" and "LSL #12"
-// variants.
-
-// If the MCInst retained a single combined immediate (which could be 0x123000,
-// for example) then both components (imm & shift) would have to be delegated to
-// a single assembly operand. This would entail a separate operand parser
-// (because the LSL would have to live in the same AArch64Operand as the
-// immediate to be accessible); assembly parsing is rather complex and
-// error-prone C++ code.
-//
-// By splitting the immediate, we can delegate handling this optional operand to
-// an InstAlias. Supporting functions to generate the correct MCInst are still
-// required, but these are essentially trivial and parsing can remain generic.
-//
-// Rejected plans with rationale:
-// ------------------------------
-//
-// In an ideal world you'de have two first class immediate operands (in
-// InOperandList, specifying imm12 and shift). Unfortunately this is not
-// selectable by any means I could discover.
-//
-// An Instruction with two MCOperands hidden behind a single entry in
-// InOperandList (expanded by ComplexPatterns and MIOperandInfo) was functional,
-// but required more C++ code to handle encoding/decoding. Parsing (the intended
-// main beneficiary) ended up equally complex because of the optional nature of
-// "LSL #0".
-//
-// Attempting to circumvent the need for a custom OperandParser above by giving
-// InstAliases without the "lsl #0" failed. add/sub could be accommodated but
-// the cmp/cmn aliases didn't use the MIOperandInfo to determine how operands
-// should be parsed: there was no way to accommodate an "lsl #12".
-
-let ParserMethod = "ParseImmWithLSLOperand",
- RenderMethod = "addImmWithLSLOperands" in {
- // Derived PredicateMethod fields are different for each
- def addsubimm_lsl0_asmoperand : AsmOperandClass {
- let Name = "AddSubImmLSL0";
- // If an error is reported against this operand, instruction could also be a
- // register variant.
- let DiagnosticType = "AddSubSecondSource";
- }
-
- def addsubimm_lsl12_asmoperand : AsmOperandClass {
- let Name = "AddSubImmLSL12";
- let DiagnosticType = "AddSubSecondSource";
- }
-}
-
-def shr_12_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getSExtValue() >> 12, MVT::i32);
-}]>;
-
-def shr_12_neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((-N->getSExtValue()) >> 12, MVT::i32);
-}]>;
-
-def neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(-N->getSExtValue(), MVT::i32);
-}]>;
-
-
-multiclass addsub_imm_operands<ValueType ty> {
- let PrintMethod = "printAddSubImmLSL0Operand",
- EncoderMethod = "getAddSubImmOpValue",
- ParserMatchClass = addsubimm_lsl0_asmoperand in {
- def _posimm_lsl0 : Operand<ty>,
- ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff) == 0; }]>;
- def _negimm_lsl0 : Operand<ty>,
- ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff) == 0; }],
- neg_XFORM>;
- }
-
- let PrintMethod = "printAddSubImmLSL12Operand",
- EncoderMethod = "getAddSubImmOpValue",
- ParserMatchClass = addsubimm_lsl12_asmoperand in {
- def _posimm_lsl12 : Operand<ty>,
- ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff000) == 0; }],
- shr_12_XFORM>;
-
- def _negimm_lsl12 : Operand<ty>,
- ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff000) == 0; }],
- shr_12_neg_XFORM>;
- }
-}
-
-// The add operands don't need any transformation
-defm addsubimm_operand_i32 : addsub_imm_operands<i32>;
-defm addsubimm_operand_i64 : addsub_imm_operands<i64>;
-
-multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift,
- string asmop, string cmpasmop,
- Operand imm_operand, Operand cmp_imm_operand,
- RegisterClass GPR, RegisterClass GPRsp,
- AArch64Reg ZR, ValueType Ty> {
- // All registers for non-S variants allow SP
- def _s : A64I_addsubimm<sf, op, 0b0, shift,
- (outs GPRsp:$Rd),
- (ins GPRsp:$Rn, imm_operand:$Imm12),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm12"),
- [(set Ty:$Rd, (add Ty:$Rn, imm_operand:$Imm12))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-
-
- // S variants can read SP but would write to ZR
- def _S : A64I_addsubimm<sf, op, 0b1, shift,
- (outs GPR:$Rd),
- (ins GPRsp:$Rn, imm_operand:$Imm12),
- !strconcat(asmop, "s\t$Rd, $Rn, $Imm12"),
- [(set Ty:$Rd, (addc Ty:$Rn, imm_operand:$Imm12))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- let Defs = [NZCV];
- }
-
- // Note that the pattern here for ADDS is subtle. Canonically CMP
- // a, b becomes SUBS a, b. If b < 0 then this is equivalent to
- // ADDS a, (-b). This is not true in general.
- def _cmp : A64I_addsubimm<sf, op, 0b1, shift,
- (outs), (ins GPRsp:$Rn, imm_operand:$Imm12),
- !strconcat(cmpasmop, " $Rn, $Imm12"),
- [(set NZCV,
- (A64cmp Ty:$Rn, cmp_imm_operand:$Imm12))],
- NoItinerary>,
- Sched<[WriteCMP, ReadCMP]> {
- let Rd = 0b11111;
- let Defs = [NZCV];
- let isCompare = 1;
- }
-}
-
-
-multiclass addsubimm_shifts<string prefix, bit sf, bit op,
- string asmop, string cmpasmop, string operand, string cmpoperand,
- RegisterClass GPR, RegisterClass GPRsp, AArch64Reg ZR,
- ValueType Ty> {
- defm _lsl0 : addsubimm_varieties<prefix # "_lsl0", sf, op, 0b00,
- asmop, cmpasmop,
- !cast<Operand>(operand # "_lsl0"),
- !cast<Operand>(cmpoperand # "_lsl0"),
- GPR, GPRsp, ZR, Ty>;
-
- defm _lsl12 : addsubimm_varieties<prefix # "_lsl12", sf, op, 0b01,
- asmop, cmpasmop,
- !cast<Operand>(operand # "_lsl12"),
- !cast<Operand>(cmpoperand # "_lsl12"),
- GPR, GPRsp, ZR, Ty>;
-}
-
-defm ADDwwi : addsubimm_shifts<"ADDwi", 0b0, 0b0, "add", "cmn",
- "addsubimm_operand_i32_posimm",
- "addsubimm_operand_i32_negimm",
- GPR32, GPR32wsp, WZR, i32>;
-defm ADDxxi : addsubimm_shifts<"ADDxi", 0b1, 0b0, "add", "cmn",
- "addsubimm_operand_i64_posimm",
- "addsubimm_operand_i64_negimm",
- GPR64, GPR64xsp, XZR, i64>;
-defm SUBwwi : addsubimm_shifts<"SUBwi", 0b0, 0b1, "sub", "cmp",
- "addsubimm_operand_i32_negimm",
- "addsubimm_operand_i32_posimm",
- GPR32, GPR32wsp, WZR, i32>;
-defm SUBxxi : addsubimm_shifts<"SUBxi", 0b1, 0b1, "sub", "cmp",
- "addsubimm_operand_i64_negimm",
- "addsubimm_operand_i64_posimm",
- GPR64, GPR64xsp, XZR, i64>;
-
-multiclass MOVsp<RegisterClass GPRsp, RegisterClass SP, Instruction addop> {
- def _fromsp : InstAlias<"mov $Rd, $Rn",
- (addop GPRsp:$Rd, SP:$Rn, 0),
- 0b1>;
-
- def _tosp : InstAlias<"mov $Rd, $Rn",
- (addop SP:$Rd, GPRsp:$Rn, 0),
- 0b1>;
-}
-
-// Recall Rxsp is a RegisterClass containing *just* xsp.
-defm MOVxx : MOVsp<GPR64xsp, Rxsp, ADDxxi_lsl0_s>;
-defm MOVww : MOVsp<GPR32wsp, Rwsp, ADDwwi_lsl0_s>;
-
-//===----------------------------------------------------------------------===//
-// Add-subtract (shifted register) instructions
-//===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS
-
-//===-------------------------------
-// 1. The "shifted register" operands. Shared with logical insts.
-//===-------------------------------
-
-multiclass shift_operands<string prefix, string form> {
- def _asmoperand_i32 : AsmOperandClass {
- let Name = "Shift" # form # "i32";
- let RenderMethod = "addShiftOperands";
- let PredicateMethod = "isShift<A64SE::" # form # ", false>";
- let DiagnosticType = "AddSubRegShift32";
- }
-
- // Note that the operand type is intentionally i64 because the DAGCombiner
- // puts these into a canonical form.
- def _i32 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
- let ParserMatchClass
- = !cast<AsmOperandClass>(prefix # "_asmoperand_i32");
- let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
- let DecoderMethod = "Decode32BitShiftOperand";
- }
-
- def _asmoperand_i64 : AsmOperandClass {
- let Name = "Shift" # form # "i64";
- let RenderMethod = "addShiftOperands";
- let PredicateMethod = "isShift<A64SE::" # form # ", true>";
- let DiagnosticType = "AddSubRegShift64";
- }
-
- def _i64 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
- let ParserMatchClass
- = !cast<AsmOperandClass>(prefix # "_asmoperand_i64");
- let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
- }
-}
-
-defm lsl_operand : shift_operands<"lsl_operand", "LSL">;
-defm lsr_operand : shift_operands<"lsr_operand", "LSR">;
-defm asr_operand : shift_operands<"asr_operand", "ASR">;
-
-// Not used for add/sub, but defined here for completeness. The "logical
-// (shifted register)" instructions *do* have an ROR variant.
-defm ror_operand : shift_operands<"ror_operand", "ROR">;
-
-//===-------------------------------
-// 2. The basic 3.5-operand ADD/SUB/ADDS/SUBS instructions.
-//===-------------------------------
-
-// N.b. the commutable parameter is just !N. It will be first against the wall
-// when the revolution comes.
-multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
- string asmop, SDPatternOperator opfrag, ValueType ty,
- RegisterClass GPR, list<Register> defs> {
- let isCommutable = commutable, Defs = defs in {
- def _lsl : A64I_addsubshift<sf, op, s, 0b00,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set GPR:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-
- def _lsr : A64I_addsubshift<sf, op, s, 0b01,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-
- def _asr : A64I_addsubshift<sf, op, s, 0b10,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
- }
-
- def _noshift
- : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
- (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
- GPR:$Rm, 0)>;
-
- def : Pat<(opfrag ty:$Rn, ty:$Rm),
- (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-multiclass addsub_sizes<string prefix, bit op, bit s, bit commutable,
- string asmop, SDPatternOperator opfrag,
- list<Register> defs> {
- defm xxx : addsub_shifts<prefix # "xxx", 0b1, op, s,
- commutable, asmop, opfrag, i64, GPR64, defs>;
- defm www : addsub_shifts<prefix # "www", 0b0, op, s,
- commutable, asmop, opfrag, i32, GPR32, defs>;
-}
-
-
-defm ADD : addsub_sizes<"ADD", 0b0, 0b0, 0b1, "add", add, []>;
-defm SUB : addsub_sizes<"SUB", 0b1, 0b0, 0b0, "sub", sub, []>;
-
-defm ADDS : addsub_sizes<"ADDS", 0b0, 0b1, 0b1, "adds", addc, [NZCV]>;
-defm SUBS : addsub_sizes<"SUBS", 0b1, 0b1, 0b0, "subs", subc, [NZCV]>;
-
-//===-------------------------------
-// 1. The NEG/NEGS aliases
-//===-------------------------------
-
-multiclass neg_alias<Instruction INST, RegisterClass GPR, Register ZR,
- ValueType ty, Operand shift_operand, SDNode shiftop> {
- def : InstAlias<"neg $Rd, $Rm, $Imm6",
- (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
-
- def : Pat<(sub 0, (shiftop ty:$Rm, shift_operand:$Imm6)),
- (INST ZR, $Rm, shift_operand:$Imm6)>;
-}
-
-defm : neg_alias<SUBwww_lsl, GPR32, WZR, i32, lsl_operand_i32, shl>;
-defm : neg_alias<SUBwww_lsr, GPR32, WZR, i32, lsr_operand_i32, srl>;
-defm : neg_alias<SUBwww_asr, GPR32, WZR, i32, asr_operand_i32, sra>;
-def : InstAlias<"neg $Rd, $Rm", (SUBwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-def : Pat<(sub 0, i32:$Rm), (SUBwww_lsl WZR, $Rm, 0)>;
-
-defm : neg_alias<SUBxxx_lsl, GPR64, XZR, i64, lsl_operand_i64, shl>;
-defm : neg_alias<SUBxxx_lsr, GPR64, XZR, i64, lsr_operand_i64, srl>;
-defm : neg_alias<SUBxxx_asr, GPR64, XZR, i64, asr_operand_i64, sra>;
-def : InstAlias<"neg $Rd, $Rm", (SUBxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-def : Pat<(sub 0, i64:$Rm), (SUBxxx_lsl XZR, $Rm, 0)>;
-
-// NEGS doesn't get any patterns yet: defining multiple outputs means C++ has to
-// be involved.
-class negs_alias<Instruction INST, RegisterClass GPR,
- Register ZR, Operand shift_operand, SDNode shiftop>
- : InstAlias<"negs $Rd, $Rm, $Imm6",
- (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
-
-def : negs_alias<SUBSwww_lsl, GPR32, WZR, lsl_operand_i32, shl>;
-def : negs_alias<SUBSwww_lsr, GPR32, WZR, lsr_operand_i32, srl>;
-def : negs_alias<SUBSwww_asr, GPR32, WZR, asr_operand_i32, sra>;
-def : InstAlias<"negs $Rd, $Rm", (SUBSwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-
-def : negs_alias<SUBSxxx_lsl, GPR64, XZR, lsl_operand_i64, shl>;
-def : negs_alias<SUBSxxx_lsr, GPR64, XZR, lsr_operand_i64, srl>;
-def : negs_alias<SUBSxxx_asr, GPR64, XZR, asr_operand_i64, sra>;
-def : InstAlias<"negs $Rd, $Rm", (SUBSxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-
-//===-------------------------------
-// 1. The CMP/CMN aliases
-//===-------------------------------
-
-multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable,
- string asmop, SDPatternOperator opfrag, ValueType ty,
- RegisterClass GPR> {
- let isCommutable = commutable, Rd = 0b11111, Defs = [NZCV] in {
- def _lsl : A64I_addsubshift<sf, op, 0b1, 0b00,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
- [(set NZCV, (opfrag ty:$Rn, (shl ty:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
- def _lsr : A64I_addsubshift<sf, op, 0b1, 0b01,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
- [(set NZCV, (opfrag ty:$Rn, (srl ty:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
- def _asr : A64I_addsubshift<sf, op, 0b1, 0b10,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
- [(set NZCV, (opfrag ty:$Rn, (sra ty:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteCMP, ReadCMP, ReadCMP]>;
- }
-
- def _noshift
- : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
- (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
-
- def : Pat<(opfrag ty:$Rn, ty:$Rm),
- (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-defm CMPww : cmp_shifts<"CMPww", 0b0, 0b1, 0b0, "cmp", A64cmp, i32, GPR32>;
-defm CMPxx : cmp_shifts<"CMPxx", 0b1, 0b1, 0b0, "cmp", A64cmp, i64, GPR64>;
-
-defm CMNww : cmp_shifts<"CMNww", 0b0, 0b0, 0b1, "cmn", A64cmn, i32, GPR32>;
-defm CMNxx : cmp_shifts<"CMNxx", 0b1, 0b0, 0b1, "cmn", A64cmn, i64, GPR64>;
-
-//===----------------------------------------------------------------------===//
-// Add-subtract (with carry) instructions
-//===----------------------------------------------------------------------===//
-// Contains: ADC, ADCS, SBC, SBCS + aliases NGC, NGCS
-
-multiclass A64I_addsubcarrySizes<bit op, bit s, string asmop> {
- let Uses = [NZCV] in {
- def www : A64I_addsubcarry<0b0, op, s, 0b000000,
- (outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def xxx : A64I_addsubcarry<0b1, op, s, 0b000000,
- (outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- }
-}
-
-let isCommutable = 1 in {
- defm ADC : A64I_addsubcarrySizes<0b0, 0b0, "adc">;
-}
-
-defm SBC : A64I_addsubcarrySizes<0b1, 0b0, "sbc">;
-
-let Defs = [NZCV] in {
- let isCommutable = 1 in {
- defm ADCS : A64I_addsubcarrySizes<0b0, 0b1, "adcs">;
- }
-
- defm SBCS : A64I_addsubcarrySizes<0b1, 0b1, "sbcs">;
-}
-
-def : InstAlias<"ngc $Rd, $Rm", (SBCwww GPR32:$Rd, WZR, GPR32:$Rm)>;
-def : InstAlias<"ngc $Rd, $Rm", (SBCxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
-def : InstAlias<"ngcs $Rd, $Rm", (SBCSwww GPR32:$Rd, WZR, GPR32:$Rm)>;
-def : InstAlias<"ngcs $Rd, $Rm", (SBCSxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
-
-// Note that adde and sube can form a chain longer than two (e.g. for 256-bit
-// addition). So the flag-setting instructions are appropriate.
-def : Pat<(adde i32:$Rn, i32:$Rm), (ADCSwww $Rn, $Rm)>;
-def : Pat<(adde i64:$Rn, i64:$Rm), (ADCSxxx $Rn, $Rm)>;
-def : Pat<(sube i32:$Rn, i32:$Rm), (SBCSwww $Rn, $Rm)>;
-def : Pat<(sube i64:$Rn, i64:$Rm), (SBCSxxx $Rn, $Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Bitfield
-//===----------------------------------------------------------------------===//
-// Contains: SBFM, BFM, UBFM, [SU]XT[BHW], ASR, LSR, LSL, SBFI[ZX], BFI, BFXIL,
-// UBFIZ, UBFX
-
-// Because of the rather complicated nearly-overlapping aliases, the decoding of
-// this range of instructions is handled manually. The architectural
-// instructions are BFM, SBFM and UBFM but a disassembler should never produce
-// these.
-//
-// In the end, the best option was to use BFM instructions for decoding under
-// almost all circumstances, but to create aliasing *Instructions* for each of
-// the canonical forms and specify a completely custom decoder which would
-// substitute the correct MCInst as needed.
-//
-// This also simplifies instruction selection, parsing etc because the MCInsts
-// have a shape that's closer to their use in code.
-
-//===-------------------------------
-// 1. The architectural BFM instructions
-//===-------------------------------
-
-def uimm5_asmoperand : AsmOperandClass {
- let Name = "UImm5";
- let PredicateMethod = "isUImm<5>";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "UImm5";
-}
-
-def uimm6_asmoperand : AsmOperandClass {
- let Name = "UImm6";
- let PredicateMethod = "isUImm<6>";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "UImm6";
-}
-
-def bitfield32_imm : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm < 32; }]> {
- let ParserMatchClass = uimm5_asmoperand;
-
- let DecoderMethod = "DecodeBitfield32ImmOperand";
-}
-
-
-def bitfield64_imm : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
- let ParserMatchClass = uimm6_asmoperand;
-
- // Default decoder works in 64-bit case: the 6-bit field can take any value.
-}
-
-multiclass A64I_bitfieldSizes<bits<2> opc, string asmop> {
- def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- let DecoderMethod = "DecodeBitfieldInstruction";
- }
-
- def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- let DecoderMethod = "DecodeBitfieldInstruction";
- }
-}
-
-defm SBFM : A64I_bitfieldSizes<0b00, "sbfm">;
-defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">;
-
-// BFM instructions modify the destination register rather than defining it
-// completely.
-def BFMwwii :
- A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
- "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]> {
- let DecoderMethod = "DecodeBitfieldInstruction";
- let Constraints = "$src = $Rd";
-}
-
-def BFMxxii :
- A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
- "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]> {
- let DecoderMethod = "DecodeBitfieldInstruction";
- let Constraints = "$src = $Rd";
-}
-
-
-//===-------------------------------
-// 2. Extend aliases to 64-bit dest
-//===-------------------------------
-
-// Unfortunately the extensions that end up as 64-bits cannot be handled by an
-// instruction alias: their syntax is (for example) "SXTB x0, w0", which needs
-// to be mapped to "SBFM x0, x0, #0, 7" (changing the class of Rn). InstAlias is
-// not capable of such a map as far as I'm aware
-
-// Note that these instructions are strictly more specific than the
-// BFM ones (in ImmR) so they can handle their own decoding.
-class A64I_bf_ext<bit sf, bits<2> opc, RegisterClass GPRDest, ValueType dty,
- string asmop, bits<6> imms, dag pattern>
- : A64I_bitfield<sf, opc, sf,
- (outs GPRDest:$Rd), (ins GPR32:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn"),
- [(set dty:$Rd, pattern)], NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- let ImmR = 0b000000;
- let ImmS = imms;
-}
-
-// Signed extensions
-def SXTBxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtb", 7,
- (sext_inreg (anyext i32:$Rn), i8)>;
-def SXTBww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxtb", 7,
- (sext_inreg i32:$Rn, i8)>;
-def SXTHxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxth", 15,
- (sext_inreg (anyext i32:$Rn), i16)>;
-def SXTHww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxth", 15,
- (sext_inreg i32:$Rn, i16)>;
-def SXTWxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtw", 31, (sext i32:$Rn)>;
-
-// Unsigned extensions
-def UXTBww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxtb", 7,
- (and i32:$Rn, 255)>;
-def UXTHww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxth", 15,
- (and i32:$Rn, 65535)>;
-
-// The 64-bit unsigned variants are not strictly architectural but recommended
-// for consistency.
-let isAsmParserOnly = 1 in {
- def UXTBxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxtb", 7,
- (and (anyext i32:$Rn), 255)>;
- def UXTHxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxth", 15,
- (and (anyext i32:$Rn), 65535)>;
-}
-
-// Extra patterns for when the source register is actually 64-bits
-// too. There's no architectural difference here, it's just LLVM
-// shinanigans. There's no need for equivalent zero-extension patterns
-// because they'll already be caught by logical (immediate) matching.
-def : Pat<(sext_inreg i64:$Rn, i8),
- (SXTBxw (EXTRACT_SUBREG $Rn, sub_32))>;
-def : Pat<(sext_inreg i64:$Rn, i16),
- (SXTHxw (EXTRACT_SUBREG $Rn, sub_32))>;
-def : Pat<(sext_inreg i64:$Rn, i32),
- (SXTWxw (EXTRACT_SUBREG $Rn, sub_32))>;
-
-
-//===-------------------------------
-// 3. Aliases for ASR and LSR (the simple shifts)
-//===-------------------------------
-
-// These also handle their own decoding because ImmS being set makes
-// them take precedence over BFM.
-multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
- def wwi : A64I_bitfield<0b0, opc, 0b0,
- (outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
- [(set i32:$Rd, (opnode i32:$Rn, bitfield32_imm:$ImmR))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- let ImmS = 31;
- }
-
- def xxi : A64I_bitfield<0b1, opc, 0b1,
- (outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
- [(set i64:$Rd, (opnode i64:$Rn, bitfield64_imm:$ImmR))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- let ImmS = 63;
- }
-
-}
-
-defm ASR : A64I_shift<0b00, "asr", sra>;
-defm LSR : A64I_shift<0b10, "lsr", srl>;
-
-//===-------------------------------
-// 4. Aliases for LSL
-//===-------------------------------
-
-// Unfortunately LSL and subsequent aliases are much more complicated. We need
-// to be able to say certain output instruction fields depend in a complex
-// manner on combinations of input assembly fields).
-//
-// MIOperandInfo *might* have been able to do it, but at the cost of
-// significantly more C++ code.
-
-// N.b. contrary to usual practice these operands store the shift rather than
-// the machine bits in an MCInst. The complexity overhead of consistency
-// outweighed the benefits in this case (custom asmparser, printer and selection
-// vs custom encoder).
-def bitfield32_lsl_imm : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
- let ParserMatchClass = uimm5_asmoperand;
- let EncoderMethod = "getBitfield32LSLOpValue";
-}
-
-def bitfield64_lsl_imm : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
- let ParserMatchClass = uimm6_asmoperand;
- let EncoderMethod = "getBitfield64LSLOpValue";
-}
-
-class A64I_bitfield_lsl<bit sf, RegisterClass GPR, ValueType ty,
- Operand operand>
- : A64I_bitfield<sf, 0b10, sf, (outs GPR:$Rd), (ins GPR:$Rn, operand:$FullImm),
- "lsl\t$Rd, $Rn, $FullImm",
- [(set ty:$Rd, (shl ty:$Rn, operand:$FullImm))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- bits<12> FullImm;
- let ImmR = FullImm{5-0};
- let ImmS = FullImm{11-6};
-
- // No disassembler allowed because it would overlap with BFM which does the
- // actual work.
- let isAsmParserOnly = 1;
-}
-
-def LSLwwi : A64I_bitfield_lsl<0b0, GPR32, i32, bitfield32_lsl_imm>;
-def LSLxxi : A64I_bitfield_lsl<0b1, GPR64, i64, bitfield64_lsl_imm>;
-
-//===-------------------------------
-// 5. Aliases for bitfield extract instructions
-//===-------------------------------
-
-def bfx32_width_asmoperand : AsmOperandClass {
- let Name = "BFX32Width";
- let PredicateMethod = "isBitfieldWidth<32>";
- let RenderMethod = "addBFXWidthOperands";
- let DiagnosticType = "Width32";
-}
-
-def bfx32_width : Operand<i64>, ImmLeaf<i64, [{ return true; }]> {
- let PrintMethod = "printBFXWidthOperand";
- let ParserMatchClass = bfx32_width_asmoperand;
-}
-
-def bfx64_width_asmoperand : AsmOperandClass {
- let Name = "BFX64Width";
- let PredicateMethod = "isBitfieldWidth<64>";
- let RenderMethod = "addBFXWidthOperands";
- let DiagnosticType = "Width64";
-}
-
-def bfx64_width : Operand<i64> {
- let PrintMethod = "printBFXWidthOperand";
- let ParserMatchClass = bfx64_width_asmoperand;
-}
-
-
-multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> {
- def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [(set i32:$Rd, (op i32:$Rn, imm:$ImmR, imm:$ImmS))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- }
-
- def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [(set i64:$Rd, (op i64:$Rn, imm:$ImmR, imm:$ImmS))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- }
-}
-
-defm SBFX : A64I_bitfield_extract<0b00, "sbfx", A64Sbfx>;
-defm UBFX : A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>;
-
-// Again, variants based on BFM modify Rd so need it as an input too.
-def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
- "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- let Constraints = "$src = $Rd";
-}
-
-def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
- "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- let Constraints = "$src = $Rd";
-}
-
-// SBFX instructions can do a 1-instruction sign-extension of boolean values.
-def : Pat<(sext_inreg i64:$Rn, i1), (SBFXxxii $Rn, 0, 0)>;
-def : Pat<(sext_inreg i32:$Rn, i1), (SBFXwwii $Rn, 0, 0)>;
-def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
- (SBFXxxii (SUBREG_TO_REG (i64 0), $Rn, sub_32), 0, 0)>;
-
-// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
-// use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
- sub_32)>;
-
-//===-------------------------------
-// 6. Aliases for bitfield insert instructions
-//===-------------------------------
-
-def bfi32_lsb_asmoperand : AsmOperandClass {
- let Name = "BFI32LSB";
- let PredicateMethod = "isUImm<5>";
- let RenderMethod = "addBFILSBOperands<32>";
- let DiagnosticType = "UImm5";
-}
-
-def bfi32_lsb : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
- let PrintMethod = "printBFILSBOperand<32>";
- let ParserMatchClass = bfi32_lsb_asmoperand;
-}
-
-def bfi64_lsb_asmoperand : AsmOperandClass {
- let Name = "BFI64LSB";
- let PredicateMethod = "isUImm<6>";
- let RenderMethod = "addBFILSBOperands<64>";
- let DiagnosticType = "UImm6";
-}
-
-def bfi64_lsb : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
- let PrintMethod = "printBFILSBOperand<64>";
- let ParserMatchClass = bfi64_lsb_asmoperand;
-}
-
-// Width verification is performed during conversion so width operand can be
-// shared between 32/64-bit cases. Still needed for the print method though
-// because ImmR encodes "width - 1".
-def bfi32_width_asmoperand : AsmOperandClass {
- let Name = "BFI32Width";
- let PredicateMethod = "isBitfieldWidth<32>";
- let RenderMethod = "addBFIWidthOperands";
- let DiagnosticType = "Width32";
-}
-
-def bfi32_width : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 32; }]> {
- let PrintMethod = "printBFIWidthOperand";
- let ParserMatchClass = bfi32_width_asmoperand;
-}
-
-def bfi64_width_asmoperand : AsmOperandClass {
- let Name = "BFI64Width";
- let PredicateMethod = "isBitfieldWidth<64>";
- let RenderMethod = "addBFIWidthOperands";
- let DiagnosticType = "Width64";
-}
-
-def bfi64_width : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 64; }]> {
- let PrintMethod = "printBFIWidthOperand";
- let ParserMatchClass = bfi64_width_asmoperand;
-}
-
-multiclass A64I_bitfield_insert<bits<2> opc, string asmop> {
- def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- }
-
- def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
- !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- }
-}
-
-defm SBFIZ : A64I_bitfield_insert<0b00, "sbfiz">;
-defm UBFIZ : A64I_bitfield_insert<0b10, "ubfiz">;
-
-
-def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
- "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- let Constraints = "$src = $Rd";
-}
-
-def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
- "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]> {
- // As above, no disassembler allowed.
- let isAsmParserOnly = 1;
- let Constraints = "$src = $Rd";
-}
-
-//===----------------------------------------------------------------------===//
-// Compare and branch (immediate)
-//===----------------------------------------------------------------------===//
-// Contains: CBZ, CBNZ
-
-class label_asmoperand<int width, int scale> : AsmOperandClass {
- let Name = "Label" # width # "_" # scale;
- let PredicateMethod = "isLabel<" # width # "," # scale # ">";
- let RenderMethod = "addLabelOperands<" # width # ", " # scale # ">";
- let DiagnosticType = "Label";
-}
-
-def label_wid19_scal4_asmoperand : label_asmoperand<19, 4>;
-
-// All conditional immediate branches are the same really: 19 signed bits scaled
-// by the instruction-size (4).
-def bcc_target : Operand<OtherVT> {
- // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
- let ParserMatchClass = label_wid19_scal4_asmoperand;
- let PrintMethod = "printLabelOperand<19, 4>";
- let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_condbr>";
- let OperandType = "OPERAND_PCREL";
-}
-
-multiclass cmpbr_sizes<bit op, string asmop, ImmLeaf SETOP> {
- let isBranch = 1, isTerminator = 1 in {
- def x : A64I_cmpbr<0b1, op,
- (outs),
- (ins GPR64:$Rt, bcc_target:$Label),
- !strconcat(asmop,"\t$Rt, $Label"),
- [(A64br_cc (A64cmp i64:$Rt, 0), SETOP, bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr, ReadBr]>;
-
- def w : A64I_cmpbr<0b0, op,
- (outs),
- (ins GPR32:$Rt, bcc_target:$Label),
- !strconcat(asmop,"\t$Rt, $Label"),
- [(A64br_cc (A64cmp i32:$Rt, 0), SETOP, bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr, ReadBr]>;
- }
-}
-
-defm CBZ : cmpbr_sizes<0b0, "cbz", ImmLeaf<i32, [{
- return Imm == A64CC::EQ;
-}]> >;
-defm CBNZ : cmpbr_sizes<0b1, "cbnz", ImmLeaf<i32, [{
- return Imm == A64CC::NE;
-}]> >;
-
-//===----------------------------------------------------------------------===//
-// Conditional branch (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: B.cc
-
-def cond_code_asmoperand : AsmOperandClass {
- let Name = "CondCode";
- let DiagnosticType = "CondCode";
-}
-
-def cond_code : Operand<i32>, ImmLeaf<i32, [{
- return Imm >= 0 && Imm <= 15;
-}]> {
- let PrintMethod = "printCondCodeOperand";
- let ParserMatchClass = cond_code_asmoperand;
-}
-
-def Bcc : A64I_condbr<0b0, 0b0, (outs),
- (ins cond_code:$Cond, bcc_target:$Label),
- "b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr]> {
- let Uses = [NZCV];
- let isBranch = 1;
- let isTerminator = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Conditional compare (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: CCMN, CCMP
-
-def uimm4_asmoperand : AsmOperandClass {
- let Name = "UImm4";
- let PredicateMethod = "isUImm<4>";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "UImm4";
-}
-
-def uimm4 : Operand<i32> {
- let ParserMatchClass = uimm4_asmoperand;
-}
-
-def uimm5 : Operand<i32> {
- let ParserMatchClass = uimm5_asmoperand;
-}
-
-// The only difference between this operand and the one for instructions like
-// B.cc is that it's parsed manually. The other get parsed implicitly as part of
-// the mnemonic handling.
-def cond_code_op_asmoperand : AsmOperandClass {
- let Name = "CondCodeOp";
- let RenderMethod = "addCondCodeOperands";
- let PredicateMethod = "isCondCode";
- let ParserMethod = "ParseCondCodeOperand";
- let DiagnosticType = "CondCode";
-}
-
-def cond_code_op : Operand<i32> {
- let PrintMethod = "printCondCodeOperand";
- let ParserMatchClass = cond_code_op_asmoperand;
-}
-
-class A64I_condcmpimmImpl<bit sf, bit op, RegisterClass GPR, string asmop>
- : A64I_condcmpimm<sf, op, 0b0, 0b0, 0b1, (outs),
- (ins GPR:$Rn, uimm5:$UImm5, uimm4:$NZCVImm, cond_code_op:$Cond),
- !strconcat(asmop, "\t$Rn, $UImm5, $NZCVImm, $Cond"),
- [], NoItinerary>,
- Sched<[WriteCMP, ReadCMP]> {
- let Defs = [NZCV];
-}
-
-def CCMNwi : A64I_condcmpimmImpl<0b0, 0b0, GPR32, "ccmn">;
-def CCMNxi : A64I_condcmpimmImpl<0b1, 0b0, GPR64, "ccmn">;
-def CCMPwi : A64I_condcmpimmImpl<0b0, 0b1, GPR32, "ccmp">;
-def CCMPxi : A64I_condcmpimmImpl<0b1, 0b1, GPR64, "ccmp">;
-
-//===----------------------------------------------------------------------===//
-// Conditional compare (register) instructions
-//===----------------------------------------------------------------------===//
-// Contains: CCMN, CCMP
-
-class A64I_condcmpregImpl<bit sf, bit op, RegisterClass GPR, string asmop>
- : A64I_condcmpreg<sf, op, 0b0, 0b0, 0b1,
- (outs),
- (ins GPR:$Rn, GPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
- !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
- [], NoItinerary>,
- Sched<[WriteCMP, ReadCMP, ReadCMP]> {
- let Defs = [NZCV];
-}
-
-def CCMNww : A64I_condcmpregImpl<0b0, 0b0, GPR32, "ccmn">;
-def CCMNxx : A64I_condcmpregImpl<0b1, 0b0, GPR64, "ccmn">;
-def CCMPww : A64I_condcmpregImpl<0b0, 0b1, GPR32, "ccmp">;
-def CCMPxx : A64I_condcmpregImpl<0b1, 0b1, GPR64, "ccmp">;
-
-//===----------------------------------------------------------------------===//
-// Conditional select instructions
-//===----------------------------------------------------------------------===//
-// Contains: CSEL, CSINC, CSINV, CSNEG + aliases CSET, CSETM, CINC, CINV, CNEG
-
-// Condition code which is encoded as the inversion (semantically rather than
-// bitwise) in the instruction.
-def inv_cond_code_op_asmoperand : AsmOperandClass {
- let Name = "InvCondCodeOp";
- let RenderMethod = "addInvCondCodeOperands";
- let PredicateMethod = "isCondCode";
- let ParserMethod = "ParseCondCodeOperand";
- let DiagnosticType = "CondCode";
-}
-
-def inv_cond_code_op : Operand<i32> {
- let ParserMatchClass = inv_cond_code_op_asmoperand;
-}
-
-// Having a separate operand for the selectable use-case is debatable, but gives
-// consistency with cond_code.
-def inv_cond_XFORM : SDNodeXForm<imm, [{
- A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(N->getZExtValue());
- return CurDAG->getTargetConstant(A64InvertCondCode(CC), MVT::i32);
-}]>;
-
-def inv_cond_code
- : ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 15; }], inv_cond_XFORM>;
-
-
-multiclass A64I_condselSizes<bit op, bits<2> op2, string asmop,
- SDPatternOperator select> {
- let Uses = [NZCV] in {
- def wwwc : A64I_condsel<0b0, op, 0b0, op2,
- (outs GPR32:$Rd),
- (ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
- [(set i32:$Rd, (select i32:$Rn, i32:$Rm))],
- NoItinerary>,
- Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
-
- def xxxc : A64I_condsel<0b1, op, 0b0, op2,
- (outs GPR64:$Rd),
- (ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
- [(set i64:$Rd, (select i64:$Rn, i64:$Rm))],
- NoItinerary>,
- Sched<[WriteCMP, ReadCMP, ReadCMP]>;
- }
-}
-
-def simple_select
- : PatFrag<(ops node:$lhs, node:$rhs),
- (A64select_cc NZCV, node:$lhs, node:$rhs, (i32 imm:$Cond))>;
-
-class complex_select<SDPatternOperator opnode>
- : PatFrag<(ops node:$lhs, node:$rhs),
- (A64select_cc NZCV, node:$lhs, (opnode node:$rhs), (i32 imm:$Cond))>;
-
-
-defm CSEL : A64I_condselSizes<0b0, 0b00, "csel", simple_select>;
-defm CSINC : A64I_condselSizes<0b0, 0b01, "csinc",
- complex_select<PatFrag<(ops node:$val),
- (add node:$val, 1)>>>;
-defm CSINV : A64I_condselSizes<0b1, 0b00, "csinv", complex_select<not>>;
-defm CSNEG : A64I_condselSizes<0b1, 0b01, "csneg", complex_select<ineg>>;
-
-// Now the instruction aliases, which fit nicely into LLVM's model:
-
-def : InstAlias<"cset $Rd, $Cond",
- (CSINCwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cset $Rd, $Cond",
- (CSINCxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"csetm $Rd, $Cond",
- (CSINVwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"csetm $Rd, $Cond",
- (CSINVxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinc $Rd, $Rn, $Cond",
- (CSINCwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinc $Rd, $Rn, $Cond",
- (CSINCxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinv $Rd, $Rn, $Cond",
- (CSINVwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinv $Rd, $Rn, $Cond",
- (CSINVxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cneg $Rd, $Rn, $Cond",
- (CSNEGwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cneg $Rd, $Rn, $Cond",
- (CSNEGxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
-
-// Finally some helper patterns.
-
-// For CSET (a.k.a. zero-extension of icmp)
-def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond),
- (CSINCwwwc WZR, WZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond),
- (CSINCwwwc WZR, WZR, inv_cond_code:$Cond)>;
-
-def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond),
- (CSINCxxxc XZR, XZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond),
- (CSINCxxxc XZR, XZR, inv_cond_code:$Cond)>;
-
-// For CSETM (a.k.a. sign-extension of icmp)
-def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
- (CSINVwwwc WZR, WZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
- (CSINVwwwc WZR, WZR, inv_cond_code:$Cond)>;
-
-def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
- (CSINVxxxc XZR, XZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
- (CSINVxxxc XZR, XZR, inv_cond_code:$Cond)>;
-
-// CINC, CINV and CNEG get dealt with automatically, which leaves the issue of
-// commutativity. The instructions are to complex for isCommutable to be used,
-// so we have to create the patterns manually:
-
-// No commutable pattern for CSEL since the commuted version is isomorphic.
-
-// CSINC
-def :Pat<(A64select_cc NZCV, (add i32:$Rm, 1), i32:$Rn, inv_cond_code:$Cond),
- (CSINCwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
-def :Pat<(A64select_cc NZCV, (add i64:$Rm, 1), i64:$Rn, inv_cond_code:$Cond),
- (CSINCxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
-
-// CSINV
-def :Pat<(A64select_cc NZCV, (not i32:$Rm), i32:$Rn, inv_cond_code:$Cond),
- (CSINVwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
-def :Pat<(A64select_cc NZCV, (not i64:$Rm), i64:$Rn, inv_cond_code:$Cond),
- (CSINVxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
-
-// CSNEG
-def :Pat<(A64select_cc NZCV, (ineg i32:$Rm), i32:$Rn, inv_cond_code:$Cond),
- (CSNEGwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
-def :Pat<(A64select_cc NZCV, (ineg i64:$Rm), i64:$Rn, inv_cond_code:$Cond),
- (CSNEGxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
-
-//===----------------------------------------------------------------------===//
-// Data Processing (1 source) instructions
-//===----------------------------------------------------------------------===//
-// Contains: RBIT, REV16, REV, REV32, CLZ, CLS.
-
-// We define an unary operator which always fails. We will use this to
-// define unary operators that cannot be matched.
-
-class A64I_dp_1src_impl<bit sf, bits<6> opcode, string asmop,
- list<dag> patterns, RegisterClass GPRrc,
- InstrItinClass itin>:
- A64I_dp_1src<sf,
- 0,
- 0b00000,
- opcode,
- !strconcat(asmop, "\t$Rd, $Rn"),
- (outs GPRrc:$Rd),
- (ins GPRrc:$Rn),
- patterns,
- itin>,
- Sched<[WriteALU, ReadALU]>;
-
-multiclass A64I_dp_1src <bits<6> opcode, string asmop> {
- let hasSideEffects = 0 in {
- def ww : A64I_dp_1src_impl<0b0, opcode, asmop, [], GPR32, NoItinerary>;
- def xx : A64I_dp_1src_impl<0b1, opcode, asmop, [], GPR64, NoItinerary>;
- }
-}
-
-defm RBIT : A64I_dp_1src<0b000000, "rbit">;
-defm CLS : A64I_dp_1src<0b000101, "cls">;
-defm CLZ : A64I_dp_1src<0b000100, "clz">;
-
-def : Pat<(ctlz i32:$Rn), (CLZww $Rn)>;
-def : Pat<(ctlz i64:$Rn), (CLZxx $Rn)>;
-def : Pat<(ctlz_zero_undef i32:$Rn), (CLZww $Rn)>;
-def : Pat<(ctlz_zero_undef i64:$Rn), (CLZxx $Rn)>;
-
-def : Pat<(cttz i32:$Rn), (CLZww (RBITww $Rn))>;
-def : Pat<(cttz i64:$Rn), (CLZxx (RBITxx $Rn))>;
-def : Pat<(cttz_zero_undef i32:$Rn), (CLZww (RBITww $Rn))>;
-def : Pat<(cttz_zero_undef i64:$Rn), (CLZxx (RBITxx $Rn))>;
-
-
-def REVww : A64I_dp_1src_impl<0b0, 0b000010, "rev",
- [(set i32:$Rd, (bswap i32:$Rn))],
- GPR32, NoItinerary>;
-def REVxx : A64I_dp_1src_impl<0b1, 0b000011, "rev",
- [(set i64:$Rd, (bswap i64:$Rn))],
- GPR64, NoItinerary>;
-def REV32xx : A64I_dp_1src_impl<0b1, 0b000010, "rev32",
- [(set i64:$Rd, (bswap (rotr i64:$Rn, (i64 32))))],
- GPR64, NoItinerary>;
-def REV16ww : A64I_dp_1src_impl<0b0, 0b000001, "rev16",
- [(set i32:$Rd, (bswap (rotr i32:$Rn, (i64 16))))],
- GPR32,
- NoItinerary>;
-def REV16xx : A64I_dp_1src_impl<0b1, 0b000001, "rev16", [], GPR64, NoItinerary>;
-
-//===----------------------------------------------------------------------===//
-// Data Processing (2 sources) instructions
-//===----------------------------------------------------------------------===//
-// Contains: CRC32C?[BHWX], UDIV, SDIV, LSLV, LSRV, ASRV, RORV + aliases LSL,
-// LSR, ASR, ROR
-
-
-class dp_2src_impl<bit sf, bits<6> opcode, string asmop, list<dag> patterns,
- RegisterClass GPRsp,
- InstrItinClass itin>:
- A64I_dp_2src<sf,
- opcode,
- 0,
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- (outs GPRsp:$Rd),
- (ins GPRsp:$Rn, GPRsp:$Rm),
- patterns,
- itin>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
-multiclass dp_2src_crc<bit c, string asmop> {
- def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0},
- !strconcat(asmop, "b"), [], GPR32, NoItinerary>;
- def H_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 1},
- !strconcat(asmop, "h"), [], GPR32, NoItinerary>;
- def W_www : dp_2src_impl<0b0, {0, 1, 0, c, 1, 0},
- !strconcat(asmop, "w"), [], GPR32, NoItinerary>;
- def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0,
- !strconcat(asmop, "x\t$Rd, $Rn, $Rm"),
- (outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-multiclass dp_2src_zext <bits<6> opcode, string asmop, SDPatternOperator op> {
- def www : dp_2src_impl<0b0,
- opcode,
- asmop,
- [(set i32:$Rd,
- (op i32:$Rn, (i64 (zext i32:$Rm))))],
- GPR32,
- NoItinerary>;
- def xxx : dp_2src_impl<0b1,
- opcode,
- asmop,
- [(set i64:$Rd, (op i64:$Rn, i64:$Rm))],
- GPR64,
- NoItinerary>;
-}
-
-
-multiclass dp_2src <bits<6> opcode, string asmop, SDPatternOperator op> {
- def www : dp_2src_impl<0b0,
- opcode,
- asmop,
- [(set i32:$Rd, (op i32:$Rn, i32:$Rm))],
- GPR32,
- NoItinerary>;
- def xxx : dp_2src_impl<0b1,
- opcode,
- asmop,
- [(set i64:$Rd, (op i64:$Rn, i64:$Rm))],
- GPR64,
- NoItinerary>;
-}
-
-// Here we define the data processing 2 source instructions.
-defm CRC32 : dp_2src_crc<0b0, "crc32">;
-defm CRC32C : dp_2src_crc<0b1, "crc32c">;
-
-let SchedRW = [WriteDiv, ReadDiv, ReadDiv] in {
- defm UDIV : dp_2src<0b000010, "udiv", udiv>;
- defm SDIV : dp_2src<0b000011, "sdiv", sdiv>;
-}
-
-let SchedRW = [WriteALUs, ReadALU, ReadALU] in {
- defm LSLV : dp_2src_zext<0b001000, "lsl", shl>;
- defm LSRV : dp_2src_zext<0b001001, "lsr", srl>;
- defm ASRV : dp_2src_zext<0b001010, "asr", sra>;
- defm RORV : dp_2src_zext<0b001011, "ror", rotr>;
-}
-
-// Extra patterns for an incoming 64-bit value for a 32-bit
-// operation. Since the LLVM operations are undefined (as in C) if the
-// RHS is out of range, it's perfectly permissible to discard the high
-// bits of the GPR64.
-def : Pat<(shl i32:$Rn, i64:$Rm),
- (LSLVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-def : Pat<(srl i32:$Rn, i64:$Rm),
- (LSRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-def : Pat<(sra i32:$Rn, i64:$Rm),
- (ASRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-def : Pat<(rotr i32:$Rn, i64:$Rm),
- (RORVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-
-// Here we define the aliases for the data processing 2 source instructions.
-def LSL_mnemonic : MnemonicAlias<"lslv", "lsl">;
-def LSR_mnemonic : MnemonicAlias<"lsrv", "lsr">;
-def ASR_menmonic : MnemonicAlias<"asrv", "asr">;
-def ROR_menmonic : MnemonicAlias<"rorv", "ror">;
-
-//===----------------------------------------------------------------------===//
-// Data Processing (3 sources) instructions
-//===----------------------------------------------------------------------===//
-// Contains: MADD, MSUB, SMADDL, SMSUBL, SMULH, UMADDL, UMSUBL, UMULH
-// + aliases MUL, MNEG, SMULL, SMNEGL, UMULL, UMNEGL
-
-class A64I_dp3_4operand<bit sf, bits<6> opcode, RegisterClass AccReg,
- ValueType AccTy, RegisterClass SrcReg,
- string asmop, dag pattern>
- : A64I_dp3<sf, opcode,
- (outs AccReg:$Rd), (ins SrcReg:$Rn, SrcReg:$Rm, AccReg:$Ra),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Ra"),
- [(set AccTy:$Rd, pattern)], NoItinerary>,
- Sched<[WriteMAC, ReadMAC, ReadMAC, ReadMAC]> {
- bits<5> Ra;
- let Inst{14-10} = Ra;
-
- RegisterClass AccGPR = AccReg;
- RegisterClass SrcGPR = SrcReg;
-}
-
-def MADDwwww : A64I_dp3_4operand<0b0, 0b000000, GPR32, i32, GPR32, "madd",
- (add i32:$Ra, (mul i32:$Rn, i32:$Rm))>;
-def MADDxxxx : A64I_dp3_4operand<0b1, 0b000000, GPR64, i64, GPR64, "madd",
- (add i64:$Ra, (mul i64:$Rn, i64:$Rm))>;
-
-def MSUBwwww : A64I_dp3_4operand<0b0, 0b000001, GPR32, i32, GPR32, "msub",
- (sub i32:$Ra, (mul i32:$Rn, i32:$Rm))>;
-def MSUBxxxx : A64I_dp3_4operand<0b1, 0b000001, GPR64, i64, GPR64, "msub",
- (sub i64:$Ra, (mul i64:$Rn, i64:$Rm))>;
-
-def SMADDLxwwx : A64I_dp3_4operand<0b1, 0b000010, GPR64, i64, GPR32, "smaddl",
- (add i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
-def SMSUBLxwwx : A64I_dp3_4operand<0b1, 0b000011, GPR64, i64, GPR32, "smsubl",
- (sub i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
-
-def UMADDLxwwx : A64I_dp3_4operand<0b1, 0b001010, GPR64, i64, GPR32, "umaddl",
- (add i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
-def UMSUBLxwwx : A64I_dp3_4operand<0b1, 0b001011, GPR64, i64, GPR32, "umsubl",
- (sub i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
-
-let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in {
- def UMULHxxx : A64I_dp3<0b1, 0b001100, (outs GPR64:$Rd),
- (ins GPR64:$Rn, GPR64:$Rm),
- "umulh\t$Rd, $Rn, $Rm",
- [(set i64:$Rd, (mulhu i64:$Rn, i64:$Rm))],
- NoItinerary>,
- Sched<[WriteMAC, ReadMAC, ReadMAC]>;
-
- def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd),
- (ins GPR64:$Rn, GPR64:$Rm),
- "smulh\t$Rd, $Rn, $Rm",
- [(set i64:$Rd, (mulhs i64:$Rn, i64:$Rm))],
- NoItinerary>,
- Sched<[WriteMAC, ReadMAC, ReadMAC]>;
-}
-
-multiclass A64I_dp3_3operand<string asmop, A64I_dp3_4operand INST,
- Register ZR, dag pattern> {
- def : InstAlias<asmop # " $Rd, $Rn, $Rm",
- (INST INST.AccGPR:$Rd, INST.SrcGPR:$Rn, INST.SrcGPR:$Rm, ZR)>;
-
- def : Pat<pattern, (INST $Rn, $Rm, ZR)>;
-}
-
-defm : A64I_dp3_3operand<"mul", MADDwwww, WZR, (mul i32:$Rn, i32:$Rm)>;
-defm : A64I_dp3_3operand<"mul", MADDxxxx, XZR, (mul i64:$Rn, i64:$Rm)>;
-
-defm : A64I_dp3_3operand<"mneg", MSUBwwww, WZR,
- (sub 0, (mul i32:$Rn, i32:$Rm))>;
-defm : A64I_dp3_3operand<"mneg", MSUBxxxx, XZR,
- (sub 0, (mul i64:$Rn, i64:$Rm))>;
-
-defm : A64I_dp3_3operand<"smull", SMADDLxwwx, XZR,
- (mul (i64 (sext i32:$Rn)), (sext i32:$Rm))>;
-defm : A64I_dp3_3operand<"smnegl", SMSUBLxwwx, XZR,
- (sub 0, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
-
-defm : A64I_dp3_3operand<"umull", UMADDLxwwx, XZR,
- (mul (i64 (zext i32:$Rn)), (zext i32:$Rm))>;
-defm : A64I_dp3_3operand<"umnegl", UMSUBLxwwx, XZR,
- (sub 0, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
-
-
-//===----------------------------------------------------------------------===//
-// Exception generation
-//===----------------------------------------------------------------------===//
-// Contains: SVC, HVC, SMC, BRK, HLT, DCPS1, DCPS2, DCPS3
-
-def uimm16_asmoperand : AsmOperandClass {
- let Name = "UImm16";
- let PredicateMethod = "isUImm<16>";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "UImm16";
-}
-
-def uimm16 : Operand<i32> {
- let ParserMatchClass = uimm16_asmoperand;
-}
-
-class A64I_exceptImpl<bits<3> opc, bits<2> ll, string asmop>
- : A64I_exception<opc, 0b000, ll, (outs), (ins uimm16:$UImm16),
- !strconcat(asmop, "\t$UImm16"), [], NoItinerary>,
- Sched<[WriteBr]> {
- let isBranch = 1;
- let isTerminator = 1;
-}
-
-def SVCi : A64I_exceptImpl<0b000, 0b01, "svc">;
-def HVCi : A64I_exceptImpl<0b000, 0b10, "hvc">;
-def SMCi : A64I_exceptImpl<0b000, 0b11, "smc">;
-def BRKi : A64I_exceptImpl<0b001, 0b00, "brk">;
-def HLTi : A64I_exceptImpl<0b010, 0b00, "hlt">;
-
-def DCPS1i : A64I_exceptImpl<0b101, 0b01, "dcps1">;
-def DCPS2i : A64I_exceptImpl<0b101, 0b10, "dcps2">;
-def DCPS3i : A64I_exceptImpl<0b101, 0b11, "dcps3">;
-
-// The immediate is optional for the DCPS instructions, defaulting to 0.
-def : InstAlias<"dcps1", (DCPS1i 0)>;
-def : InstAlias<"dcps2", (DCPS2i 0)>;
-def : InstAlias<"dcps3", (DCPS3i 0)>;
-
-//===----------------------------------------------------------------------===//
-// Extract (immediate)
-//===----------------------------------------------------------------------===//
-// Contains: EXTR + alias ROR
-
-def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0,
- (outs GPR32:$Rd),
- (ins GPR32:$Rn, GPR32:$Rm, bitfield32_imm:$LSB),
- "extr\t$Rd, $Rn, $Rm, $LSB",
- [(set i32:$Rd,
- (A64Extr i32:$Rn, i32:$Rm, imm:$LSB))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1,
- (outs GPR64:$Rd),
- (ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB),
- "extr\t$Rd, $Rn, $Rm, $LSB",
- [(set i64:$Rd,
- (A64Extr i64:$Rn, i64:$Rm, imm:$LSB))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
-def : InstAlias<"ror $Rd, $Rs, $LSB",
- (EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>;
-def : InstAlias<"ror $Rd, $Rs, $LSB",
- (EXTRxxxi GPR64:$Rd, GPR64:$Rs, GPR64:$Rs, bitfield64_imm:$LSB)>;
-
-def : Pat<(rotr i32:$Rn, bitfield32_imm:$LSB),
- (EXTRwwwi $Rn, $Rn, bitfield32_imm:$LSB)>;
-def : Pat<(rotr i64:$Rn, bitfield64_imm:$LSB),
- (EXTRxxxi $Rn, $Rn, bitfield64_imm:$LSB)>;
-
-//===----------------------------------------------------------------------===//
-// Floating-point compare instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCMP, FCMPE
-
-def fpzero_asmoperand : AsmOperandClass {
- let Name = "FPZero";
- let ParserMethod = "ParseFPImmOperand";
- let DiagnosticType = "FPZero";
-}
-
-def fpz32 : Operand<f32>,
- ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
- let ParserMatchClass = fpzero_asmoperand;
- let PrintMethod = "printFPZeroOperand";
- let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-def fpz64 : Operand<f64>,
- ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
- let ParserMatchClass = fpzero_asmoperand;
- let PrintMethod = "printFPZeroOperand";
- let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-def fpz64movi : Operand<i64>,
- ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
- let ParserMatchClass = fpzero_asmoperand;
- let PrintMethod = "printFPZeroOperand";
- let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
- def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
- (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Defs = [NZCV];
- }
-
- def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0},
- (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Defs = [NZCV];
- }
-}
-
-defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm),
- (set NZCV, (A64cmp f32:$Rn, f32:$Rm))>;
-defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm),
- (set NZCV, (A64cmp f64:$Rn, f64:$Rm))>;
-
-// What would be Rm should be written as 0; note that even though it's called
-// "$Rm" here to fit in with the InstrFormats, it's actually an immediate.
-defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Rm),
- (set NZCV, (A64cmp f32:$Rn, fpz32:$Rm))>;
-
-defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Rm),
- (set NZCV, (A64cmp f64:$Rn, fpz64:$Rm))>;
-
-
-//===----------------------------------------------------------------------===//
-// Floating-point conditional compare instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCCMP, FCCMPE
-
-class A64I_fpccmpImpl<bits<2> type, bit op, RegisterClass FPR, string asmop>
- : A64I_fpccmp<0b0, 0b0, type, op,
- (outs),
- (ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
- !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Defs = [NZCV];
-}
-
-def FCCMPss : A64I_fpccmpImpl<0b00, 0b0, FPR32, "fccmp">;
-def FCCMPEss : A64I_fpccmpImpl<0b00, 0b1, FPR32, "fccmpe">;
-def FCCMPdd : A64I_fpccmpImpl<0b01, 0b0, FPR64, "fccmp">;
-def FCCMPEdd : A64I_fpccmpImpl<0b01, 0b1, FPR64, "fccmpe">;
-
-//===----------------------------------------------------------------------===//
-// Floating-point conditional select instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCSEL
-
-let Uses = [NZCV] in {
- def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd),
- (ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond),
- "fcsel\t$Rd, $Rn, $Rm, $Cond",
- [(set f32:$Rd,
- (simple_select f32:$Rn, f32:$Rm))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-
- def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd),
- (ins FPR64:$Rn, FPR64:$Rm, cond_code_op:$Cond),
- "fcsel\t$Rd, $Rn, $Rm, $Cond",
- [(set f64:$Rd,
- (simple_select f64:$Rn, f64:$Rm))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating-point data-processing (1 source)
-//===----------------------------------------------------------------------===//
-// Contains: FMOV, FABS, FNEG, FSQRT, FCVT, FRINT[NPMZAXI].
-
-def FPNoUnop : PatFrag<(ops node:$val), (fneg node:$val),
- [{ (void)N; return false; }]>;
-
-// First we do the fairly trivial bunch with uniform "OP s, s" and "OP d, d"
-// syntax. Default to no pattern because most are odd enough not to have one.
-multiclass A64I_fpdp1sizes<bits<6> opcode, string asmstr,
- SDPatternOperator opnode = FPNoUnop> {
- def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn),
- !strconcat(asmstr, "\t$Rd, $Rn"),
- [(set f32:$Rd, (opnode f32:$Rn))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn),
- !strconcat(asmstr, "\t$Rd, $Rn"),
- [(set f64:$Rd, (opnode f64:$Rn))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FMOV : A64I_fpdp1sizes<0b000000, "fmov">;
-defm FABS : A64I_fpdp1sizes<0b000001, "fabs", fabs>;
-defm FNEG : A64I_fpdp1sizes<0b000010, "fneg", fneg>;
-let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
- defm FSQRT : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
-}
-
-defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">;
-defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>;
-defm FRINTM : A64I_fpdp1sizes<0b001010, "frintm", ffloor>;
-defm FRINTZ : A64I_fpdp1sizes<0b001011, "frintz", ftrunc>;
-defm FRINTA : A64I_fpdp1sizes<0b001100, "frinta">;
-defm FRINTX : A64I_fpdp1sizes<0b001110, "frintx", frint>;
-defm FRINTI : A64I_fpdp1sizes<0b001111, "frinti", fnearbyint>;
-
-// The FCVT instrucitons have different source and destination register-types,
-// but the fields are uniform everywhere a D-register (say) crops up. Package
-// this information in a Record.
-class FCVTRegType<RegisterClass rc, bits<2> fld, ValueType vt> {
- RegisterClass Class = rc;
- ValueType VT = vt;
- bit t1 = fld{1};
- bit t0 = fld{0};
-}
-
-def FCVT16 : FCVTRegType<FPR16, 0b11, f16>;
-def FCVT32 : FCVTRegType<FPR32, 0b00, f32>;
-def FCVT64 : FCVTRegType<FPR64, 0b01, f64>;
-
-class A64I_fpdp1_fcvt<FCVTRegType DestReg, FCVTRegType SrcReg, SDNode opnode>
- : A64I_fpdp1<0b0, 0b0, {SrcReg.t1, SrcReg.t0},
- {0,0,0,1, DestReg.t1, DestReg.t0},
- (outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn),
- "fcvt\t$Rd, $Rn",
- [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-def FCVTds : A64I_fpdp1_fcvt<FCVT64, FCVT32, fextend>;
-def FCVThs : A64I_fpdp1_fcvt<FCVT16, FCVT32, fround>;
-def FCVTsd : A64I_fpdp1_fcvt<FCVT32, FCVT64, fround>;
-def FCVThd : A64I_fpdp1_fcvt<FCVT16, FCVT64, fround>;
-def FCVTsh : A64I_fpdp1_fcvt<FCVT32, FCVT16, fextend>;
-def FCVTdh : A64I_fpdp1_fcvt<FCVT64, FCVT16, fextend>;
-
-
-//===----------------------------------------------------------------------===//
-// Floating-point data-processing (2 sources) instructions
-//===----------------------------------------------------------------------===//
-// Contains: FMUL, FDIV, FADD, FSUB, FMAX, FMIN, FMAXNM, FMINNM, FNMUL
-
-def FPNoBinop : PatFrag<(ops node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs),
- [{ (void)N; return false; }]>;
-
-multiclass A64I_fpdp2sizes<bits<4> opcode, string asmstr,
- SDPatternOperator opnode> {
- def sss : A64I_fpdp2<0b0, 0b0, 0b00, opcode,
- (outs FPR32:$Rd),
- (ins FPR32:$Rn, FPR32:$Rm),
- !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
- [(set f32:$Rd, (opnode f32:$Rn, f32:$Rm))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode,
- (outs FPR64:$Rd),
- (ins FPR64:$Rn, FPR64:$Rm),
- !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
- [(set f64:$Rd, (opnode f64:$Rn, f64:$Rm))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-}
-
-let isCommutable = 1 in {
- let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
- defm FMUL : A64I_fpdp2sizes<0b0000, "fmul", fmul>;
- }
- defm FADD : A64I_fpdp2sizes<0b0010, "fadd", fadd>;
-
- // No patterns for these.
- defm FMAX : A64I_fpdp2sizes<0b0100, "fmax", FPNoBinop>;
- defm FMIN : A64I_fpdp2sizes<0b0101, "fmin", FPNoBinop>;
- defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>;
- defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>;
-
- let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
- defm FNMUL : A64I_fpdp2sizes<0b1000, "fnmul",
- PatFrag<(ops node:$lhs, node:$rhs),
- (fneg (fmul node:$lhs, node:$rhs))> >;
- }
-}
-
-let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
- defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
-}
-defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
-
-//===----------------------------------------------------------------------===//
-// Floating-point data-processing (3 sources) instructions
-//===----------------------------------------------------------------------===//
-// Contains: FMADD, FMSUB, FNMADD, FNMSUB
-
-def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
- (fma (fneg node:$Rn), node:$Rm, node:$Ra)>;
-def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
- (fma node:$Rn, node:$Rm, (fneg node:$Ra))>;
-def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
- (fma (fneg node:$Rn), node:$Rm, (fneg node:$Ra))>;
-
-class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
- bits<2> type, bit o1, bit o0, SDPatternOperator fmakind>
- : A64I_fpdp3<0b0, 0b0, type, o1, o0, (outs FPR:$Rd),
- (ins FPR:$Rn, FPR:$Rm, FPR:$Ra),
- !strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"),
- [(set VT:$Rd, (fmakind VT:$Rn, VT:$Rm, VT:$Ra))],
- NoItinerary>,
- Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]>;
-
-def FMADDssss : A64I_fpdp3Impl<"fmadd", FPR32, f32, 0b00, 0b0, 0b0, fma>;
-def FMSUBssss : A64I_fpdp3Impl<"fmsub", FPR32, f32, 0b00, 0b0, 0b1, fmsub>;
-def FNMADDssss : A64I_fpdp3Impl<"fnmadd", FPR32, f32, 0b00, 0b1, 0b0, fnmadd>;
-def FNMSUBssss : A64I_fpdp3Impl<"fnmsub", FPR32, f32, 0b00, 0b1, 0b1, fnmsub>;
-
-def FMADDdddd : A64I_fpdp3Impl<"fmadd", FPR64, f64, 0b01, 0b0, 0b0, fma>;
-def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
-def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
-def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
-
-// Extra patterns for when we're allowed to optimise separate multiplication and
-// addition.
-let Predicates = [HasFPARMv8, UseFusedMAC] in {
-def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
- (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
- (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
- (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
- (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
- (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
- (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
- (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
- (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-}
-
-
-//===----------------------------------------------------------------------===//
-// Floating-point <-> fixed-point conversion instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
-
-// #1-#32 allowed, encoded as "64 - <specified imm>
-def fixedpos_asmoperand_i32 : AsmOperandClass {
- let Name = "CVTFixedPos32";
- let RenderMethod = "addCVTFixedPosOperands";
- let PredicateMethod = "isCVTFixedPos<32>";
- let DiagnosticType = "CVTFixedPos32";
-}
-
-// Also encoded as "64 - <specified imm>" but #1-#64 allowed.
-def fixedpos_asmoperand_i64 : AsmOperandClass {
- let Name = "CVTFixedPos64";
- let RenderMethod = "addCVTFixedPosOperands";
- let PredicateMethod = "isCVTFixedPos<64>";
- let DiagnosticType = "CVTFixedPos64";
-}
-
-// We need the cartesian product of f32/f64 i32/i64 operands for
-// conversions:
-// + Selection needs to use operands of correct floating type
-// + Assembly parsing and decoding depend on integer width
-class cvtfix_i32_op<ValueType FloatVT>
- : Operand<FloatVT>,
- ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm]> {
- let ParserMatchClass = fixedpos_asmoperand_i32;
- let DecoderMethod = "DecodeCVT32FixedPosOperand";
- let PrintMethod = "printCVTFixedPosOperand";
-}
-
-class cvtfix_i64_op<ValueType FloatVT>
- : Operand<FloatVT>,
- ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm]> {
- let ParserMatchClass = fixedpos_asmoperand_i64;
- let PrintMethod = "printCVTFixedPosOperand";
-}
-
-// Because of the proliferation of weird operands, it's not really
-// worth going for a multiclass here. Oh well.
-
-class A64I_fptofix<bit sf, bits<2> type, bits<3> opcode,
- RegisterClass GPR, RegisterClass FPR,
- ValueType DstTy, ValueType SrcTy,
- Operand scale_op, string asmop, SDNode cvtop>
- : A64I_fpfixed<sf, 0b0, type, 0b11, opcode,
- (outs GPR:$Rd), (ins FPR:$Rn, scale_op:$Scale),
- !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
- [(set DstTy:$Rd, (cvtop (fmul SrcTy:$Rn, scale_op:$Scale)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, i32, f32,
- cvtfix_i32_op<f32>, "fcvtzs", fp_to_sint>;
-def FCVTZSxsi : A64I_fptofix<0b1, 0b00, 0b000, GPR64, FPR32, i64, f32,
- cvtfix_i64_op<f32>, "fcvtzs", fp_to_sint>;
-def FCVTZUwsi : A64I_fptofix<0b0, 0b00, 0b001, GPR32, FPR32, i32, f32,
- cvtfix_i32_op<f32>, "fcvtzu", fp_to_uint>;
-def FCVTZUxsi : A64I_fptofix<0b1, 0b00, 0b001, GPR64, FPR32, i64, f32,
- cvtfix_i64_op<f32>, "fcvtzu", fp_to_uint>;
-
-def FCVTZSwdi : A64I_fptofix<0b0, 0b01, 0b000, GPR32, FPR64, i32, f64,
- cvtfix_i32_op<f64>, "fcvtzs", fp_to_sint>;
-def FCVTZSxdi : A64I_fptofix<0b1, 0b01, 0b000, GPR64, FPR64, i64, f64,
- cvtfix_i64_op<f64>, "fcvtzs", fp_to_sint>;
-def FCVTZUwdi : A64I_fptofix<0b0, 0b01, 0b001, GPR32, FPR64, i32, f64,
- cvtfix_i32_op<f64>, "fcvtzu", fp_to_uint>;
-def FCVTZUxdi : A64I_fptofix<0b1, 0b01, 0b001, GPR64, FPR64, i64, f64,
- cvtfix_i64_op<f64>, "fcvtzu", fp_to_uint>;
-
-
-class A64I_fixtofp<bit sf, bits<2> type, bits<3> opcode,
- RegisterClass FPR, RegisterClass GPR,
- ValueType DstTy, ValueType SrcTy,
- Operand scale_op, string asmop, SDNode cvtop>
- : A64I_fpfixed<sf, 0b0, type, 0b00, opcode,
- (outs FPR:$Rd), (ins GPR:$Rn, scale_op:$Scale),
- !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
- [(set DstTy:$Rd, (fdiv (cvtop SrcTy:$Rn), scale_op:$Scale))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, f32, i32,
- cvtfix_i32_op<f32>, "scvtf", sint_to_fp>;
-def SCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b010, FPR32, GPR64, f32, i64,
- cvtfix_i64_op<f32>, "scvtf", sint_to_fp>;
-def UCVTFswi : A64I_fixtofp<0b0, 0b00, 0b011, FPR32, GPR32, f32, i32,
- cvtfix_i32_op<f32>, "ucvtf", uint_to_fp>;
-def UCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b011, FPR32, GPR64, f32, i64,
- cvtfix_i64_op<f32>, "ucvtf", uint_to_fp>;
-def SCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b010, FPR64, GPR32, f64, i32,
- cvtfix_i32_op<f64>, "scvtf", sint_to_fp>;
-def SCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b010, FPR64, GPR64, f64, i64,
- cvtfix_i64_op<f64>, "scvtf", sint_to_fp>;
-def UCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b011, FPR64, GPR32, f64, i32,
- cvtfix_i32_op<f64>, "ucvtf", uint_to_fp>;
-def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, f64, i64,
- cvtfix_i64_op<f64>, "ucvtf", uint_to_fp>;
-
-//===----------------------------------------------------------------------===//
-// Floating-point <-> integer conversion instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
-
-class A64I_fpintI<bit sf, bits<2> type, bits<2> rmode, bits<3> opcode,
- RegisterClass DestPR, RegisterClass SrcPR, string asmop>
- : A64I_fpint<sf, 0b0, type, rmode, opcode, (outs DestPR:$Rd), (ins SrcPR:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass A64I_fptointRM<bits<2> rmode, bit o2, string asmop> {
- def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0},
- GPR32, FPR32, asmop # "s">;
- def Sxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 0},
- GPR64, FPR32, asmop # "s">;
- def Uws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 1},
- GPR32, FPR32, asmop # "u">;
- def Uxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 1},
- GPR64, FPR32, asmop # "u">;
-
- def Swd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 0},
- GPR32, FPR64, asmop # "s">;
- def Sxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 0},
- GPR64, FPR64, asmop # "s">;
- def Uwd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 1},
- GPR32, FPR64, asmop # "u">;
- def Uxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 1},
- GPR64, FPR64, asmop # "u">;
-}
-
-defm FCVTN : A64I_fptointRM<0b00, 0b0, "fcvtn">;
-defm FCVTP : A64I_fptointRM<0b01, 0b0, "fcvtp">;
-defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
-defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
-defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
-def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
-def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
-def : Pat<(i64 (fp_to_uint f32:$Rn)), (FCVTZUxs $Rn)>;
-def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
-def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
-def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
-def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
-}
-
-multiclass A64I_inttofp<bit o0, string asmop> {
- def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
- def CVTFsx : A64I_fpintI<0b1, 0b00, 0b00, {0, 1, o0}, FPR32, GPR64, asmop>;
- def CVTFdw : A64I_fpintI<0b0, 0b01, 0b00, {0, 1, o0}, FPR64, GPR32, asmop>;
- def CVTFdx : A64I_fpintI<0b1, 0b01, 0b00, {0, 1, o0}, FPR64, GPR64, asmop>;
-}
-
-defm S : A64I_inttofp<0b0, "scvtf">;
-defm U : A64I_inttofp<0b1, "ucvtf">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
-def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
-def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
-def : Pat<(f64 (sint_to_fp i64:$Rn)), (SCVTFdx $Rn)>;
-def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
-def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
-def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
-def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
-}
-
-def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
-def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
-def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
-def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
-def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
-def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
-def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
-}
-
-def lane1_asmoperand : AsmOperandClass {
- let Name = "Lane1";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "Lane1";
-}
-
-def lane1 : Operand<i32> {
- let ParserMatchClass = lane1_asmoperand;
- let PrintMethod = "printBareImmOperand";
-}
-
-let DecoderMethod = "DecodeFMOVLaneInstruction" in {
- def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110,
- (outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane),
- "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111,
- (outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane),
- "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-let Predicates = [HasFPARMv8] in {
-def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
- (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
-
-def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
- (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating-point immediate instructions
-//===----------------------------------------------------------------------===//
-// Contains: FMOV
-
-def fpimm_asmoperand : AsmOperandClass {
- let Name = "FMOVImm";
- let ParserMethod = "ParseFPImmOperand";
- let DiagnosticType = "FPImm";
-}
-
-// The MCOperand for these instructions are the encoded 8-bit values.
-def SDXF_fpimm : SDNodeXForm<fpimm, [{
- uint32_t Imm8;
- A64Imms::isFPImm(N->getValueAPF(), Imm8);
- return CurDAG->getTargetConstant(Imm8, MVT::i32);
-}]>;
-
-class fmov_operand<ValueType FT>
- : Operand<i32>,
- PatLeaf<(FT fpimm), [{ return A64Imms::isFPImm(N->getValueAPF()); }],
- SDXF_fpimm> {
- let PrintMethod = "printFPImmOperand";
- let ParserMatchClass = fpimm_asmoperand;
-}
-
-def fmov32_operand : fmov_operand<f32>;
-def fmov64_operand : fmov_operand<f64>;
-
-class A64I_fpimm_impl<bits<2> type, RegisterClass Reg, ValueType VT,
- Operand fmov_operand>
- : A64I_fpimm<0b0, 0b0, type, 0b00000,
- (outs Reg:$Rd),
- (ins fmov_operand:$Imm8),
- "fmov\t$Rd, $Imm8",
- [(set VT:$Rd, fmov_operand:$Imm8)],
- NoItinerary>,
- Sched<[WriteFPALU]>;
-
-def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>;
-def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>;
-
-//===----------------------------------------------------------------------===//
-// Load-register (literal) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDR, LDRSW, PRFM
-
-def ldrlit_label_asmoperand : AsmOperandClass {
- let Name = "LoadLitLabel";
- let RenderMethod = "addLabelOperands<19, 4>";
- let DiagnosticType = "Label";
-}
-
-def ldrlit_label : Operand<i64> {
- let EncoderMethod = "getLoadLitLabelOpValue";
-
- // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
- let PrintMethod = "printLabelOperand<19, 4>";
- let ParserMatchClass = ldrlit_label_asmoperand;
- let OperandType = "OPERAND_PCREL";
-}
-
-// Various instructions take an immediate value (which can always be used),
-// where some numbers have a symbolic name to make things easier. These operands
-// and the associated functions abstract away the differences.
-multiclass namedimm<string prefix, string mapper> {
+// Final group of aliases covers true "mov $Rd, $imm" cases.
+multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
+ int width, int shift> {
def _asmoperand : AsmOperandClass {
- let Name = "NamedImm" # prefix;
- let PredicateMethod = "isUImm";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "ParseNamedImmOperand<" # mapper # ">";
- let DiagnosticType = "NamedImm_" # prefix;
+ let Name = basename # width # "_lsl" # shift # "MovAlias";
+ let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
+ # shift # ">";
+ let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
}
- def _op : Operand<i32> {
- let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
- let PrintMethod = "printNamedImmOperand<" # mapper # ">";
- let DecoderMethod = "DecodeNamedImmOperand<" # mapper # ">";
- }
-}
-
-defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">;
-
-class A64I_LDRlitSimple<bits<2> opc, bit v, RegisterClass OutReg,
- list<dag> patterns = []>
- : A64I_LDRlit<opc, v, (outs OutReg:$Rt), (ins ldrlit_label:$Imm19),
- "ldr\t$Rt, $Imm19", patterns, NoItinerary>,
- Sched<[WriteLd]>;
-
-let mayLoad = 1 in {
- def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>;
- def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
-}
-
-let Predicates = [HasFPARMv8] in {
-def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
-def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
-}
-
-let mayLoad = 1 in {
- let Predicates = [HasFPARMv8] in {
- def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+ def _movimm : Operand<i32> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
}
- def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
- (outs GPR64:$Rt),
- (ins ldrlit_label:$Imm19),
- "ldrsw\t$Rt, $Imm19",
- [], NoItinerary>,
- Sched<[WriteLd]>;
-
- def PRFM_lit : A64I_LDRlit<0b11, 0b0,
- (outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19),
- "prfm\t$Rt, $Imm19",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]>;
+ def : InstAlias<"mov $Rd, $imm",
+ (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
}
-//===----------------------------------------------------------------------===//
-// Load-store exclusive instructions
-//===----------------------------------------------------------------------===//
-// Contains: STXRB, STXRH, STXR, LDXRB, LDXRH, LDXR. STXP, LDXP, STLXRB,
-// STLXRH, STLXR, LDAXRB, LDAXRH, LDAXR, STLXP, LDAXP, STLRB,
-// STLRH, STLR, LDARB, LDARH, LDAR
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;
-// Since these instructions have the undefined register bits set to 1 in
-// their canonical form, we need a post encoder method to set those bits
-// to 1 when encoding these instructions. We do this using the
-// fixLoadStoreExclusive function. This function has template parameters:
-//
-// fixLoadStoreExclusive<int hasRs, int hasRt2>
-//
-// hasRs indicates that the instruction uses the Rs field, so we won't set
-// it to 1 (and the same for Rt2). We don't need template parameters for
-// the other register fiels since Rt and Rn are always used.
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;
-// This operand parses a GPR64xsp register, followed by an optional immediate
-// #0.
-def GPR64xsp0_asmoperand : AsmOperandClass {
- let Name = "GPR64xsp0";
- let PredicateMethod = "isWrappedReg";
- let RenderMethod = "addRegOperands";
- let ParserMethod = "ParseLSXAddressOperand";
- // Diagnostics are provided by ParserMethod
-}
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;
-def GPR64xsp0 : RegisterOperand<GPR64xsp> {
- let ParserMatchClass = GPR64xsp0_asmoperand;
-}
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;
-//===----------------------------------
-// Store-exclusive (releasing & normal)
-//===----------------------------------
+let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
+ isAsCheapAsAMove = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions. When that changes, we can select
+// directly to the real instructions and get rid of these pseudos.
-class A64I_SRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
- dag ins, list<dag> pat,
- InstrItinClass itin> :
- A64I_LDSTex_stn <size,
- opcode{2}, 0, opcode{1}, opcode{0},
- outs, ins,
- !strconcat(asm, "\t$Rs, $Rt, [$Rn]"),
- pat, itin> {
- let mayStore = 1;
- let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
- let Constraints = "@earlyclobber $Rs";
-}
+def MOVi32imm
+ : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
+ [(set GPR32:$dst, imm:$src)]>,
+ Sched<[WriteImm]>;
+def MOVi64imm
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
+ [(set GPR64:$dst, imm:$src)]>,
+ Sched<[WriteImm]>;
+} // isReMaterializable, isCodeGenOnly
-multiclass A64I_SRex<string asmstr, bits<3> opcode, string prefix> {
- def _byte: A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
- (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
-
- def _hword: A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
- (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [],NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
-
- def _word: A64I_SRexs_impl<0b10, opcode, asmstr,
- (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
-
- def _dword: A64I_SRexs_impl<0b11, opcode, asmstr,
- (outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
-}
-
-defm STXR : A64I_SRex<"stxr", 0b000, "STXR">;
-defm STLXR : A64I_SRex<"stlxr", 0b001, "STLXR">;
-
-//===----------------------------------
-// Loads
-//===----------------------------------
-
-class A64I_LRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
- dag ins, list<dag> pat,
- InstrItinClass itin> :
- A64I_LDSTex_tn <size,
- opcode{2}, 1, opcode{1}, opcode{0},
- outs, ins,
- !strconcat(asm, "\t$Rt, [$Rn]"),
- pat, itin> {
- let mayLoad = 1;
- let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
-}
-
-multiclass A64I_LRex<string asmstr, bits<3> opcode> {
- def _byte: A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
- (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteLd]>;
-
- def _hword: A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
- (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteLd]>;
-
- def _word: A64I_LRexs_impl<0b10, opcode, asmstr,
- (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteLd]>;
-
- def _dword: A64I_LRexs_impl<0b11, opcode, asmstr,
- (outs GPR64:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteLd]>;
-}
-
-defm LDXR : A64I_LRex<"ldxr", 0b000>;
-defm LDAXR : A64I_LRex<"ldaxr", 0b001>;
-defm LDAR : A64I_LRex<"ldar", 0b101>;
-
-class acquiring_load<PatFrag base>
- : PatFrag<(ops node:$ptr), (base node:$ptr), [{
- AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return Ordering == Acquire || Ordering == SequentiallyConsistent;
+// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
+// eventual expansion code fewer bits to worry about getting right. Marshalling
+// the types is a little tricky though:
+def i64imm_32bit : ImmLeaf<i64, [{
+ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
}]>;
-def atomic_load_acquire_8 : acquiring_load<atomic_load_8>;
-def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
-def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
-def atomic_load_acquire_64 : acquiring_load<atomic_load_64>;
-
-def : Pat<(atomic_load_acquire_8 i64:$Rn), (LDAR_byte $Rn)>;
-def : Pat<(atomic_load_acquire_16 i64:$Rn), (LDAR_hword $Rn)>;
-def : Pat<(atomic_load_acquire_32 i64:$Rn), (LDAR_word $Rn)>;
-def : Pat<(atomic_load_acquire_64 i64:$Rn), (LDAR_dword $Rn)>;
-
-//===----------------------------------
-// Store-release (no exclusivity)
-//===----------------------------------
-
-class A64I_SLexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
- dag ins, list<dag> pat,
- InstrItinClass itin> :
- A64I_LDSTex_tn <size,
- opcode{2}, 0, opcode{1}, opcode{0},
- outs, ins,
- !strconcat(asm, "\t$Rt, [$Rn]"),
- pat, itin> {
- let mayStore = 1;
- let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
-}
-
-class releasing_store<PatFrag base>
- : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
- AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return Ordering == Release || Ordering == SequentiallyConsistent;
+def trunc_imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i32);
}]>;
-def atomic_store_release_8 : releasing_store<atomic_store_8>;
-def atomic_store_release_16 : releasing_store<atomic_store_16>;
-def atomic_store_release_32 : releasing_store<atomic_store_32>;
-def atomic_store_release_64 : releasing_store<atomic_store_64>;
+def : Pat<(i64 i64imm_32bit:$src),
+ (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
-multiclass A64I_SLex<string asmstr, bits<3> opcode, string prefix> {
- def _byte: A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
- (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [(atomic_store_release_8 i64:$Rn, i32:$Rt)],
- NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
+// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
+// sequences.
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
+ tglobaladdr:$g1, tglobaladdr:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
+ tglobaladdr:$g2, 32),
+ tglobaladdr:$g1, 16),
+ tglobaladdr:$g0, 0)>;
- def _hword: A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
- (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [(atomic_store_release_16 i64:$Rn, i32:$Rt)],
- NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
+def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
+ tblockaddress:$g1, tblockaddress:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g0, 0)>;
- def _word: A64I_SLexs_impl<0b10, opcode, asmstr,
- (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [(atomic_store_release_32 i64:$Rn, i32:$Rt)],
- NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
+def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
+ tconstpool:$g1, tconstpool:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
+ tconstpool:$g2, 32),
+ tconstpool:$g1, 16),
+ tconstpool:$g0, 0)>;
- def _dword: A64I_SLexs_impl<0b11, opcode, asmstr,
- (outs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
- [(atomic_store_release_64 i64:$Rn, i64:$Rt)],
- NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]>;
+def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
+ tjumptable:$g1, tjumptable:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
+ tjumptable:$g2, 32),
+ tjumptable:$g1, 16),
+ tjumptable:$g0, 0)>;
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic instructions.
+//===----------------------------------------------------------------------===//
+
+// Add/subtract with carry.
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
+defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;
+
+def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
+
+// Add/subtract
+defm ADD : AddSub<0, "add", add>;
+defm SUB : AddSub<1, "sub">;
+
+def : InstAlias<"mov $dst, $src",
+ (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+ (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+ (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+ (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+
+defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">;
+defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">;
+
+// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
+def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
+ (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
+ (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
+ (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
+ (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
+ (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
+ (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+// These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
}
-defm STLR : A64I_SLex<"stlr", 0b101, "STLR">;
-
-//===----------------------------------
-// Store-exclusive pair (releasing & normal)
-//===----------------------------------
-
-class A64I_SPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
- dag ins, list<dag> pat,
- InstrItinClass itin> :
- A64I_LDSTex_stt2n <size,
- opcode{2}, 0, opcode{1}, opcode{0},
- outs, ins,
- !strconcat(asm, "\t$Rs, $Rt, $Rt2, [$Rn]"),
- pat, itin> {
- let mayStore = 1;
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+// These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
}
+def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src$shift",
+ (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"neg $dst, $src$shift",
+ (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
-multiclass A64I_SPex<string asmstr, bits<3> opcode> {
- def _word: A64I_SPexs_impl<0b10, opcode, asmstr, (outs),
- (ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2,
- GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
+def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src$shift",
+ (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"negs $dst, $src$shift",
+ (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
- def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs),
- (ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2,
- GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
+
+// Unsigned/Signed divide
+defm UDIV : Div<0, "udiv", udiv>;
+defm SDIV : Div<1, "sdiv", sdiv>;
+let isCodeGenOnly = 1 in {
+defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>;
+defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>;
}
-defm STXP : A64I_SPex<"stxp", 0b010>;
-defm STLXP : A64I_SPex<"stlxp", 0b011>;
+// Variable shift
+defm ASRV : Shift<0b10, "asr", sra>;
+defm LSLV : Shift<0b00, "lsl", shl>;
+defm LSRV : Shift<0b01, "lsr", srl>;
+defm RORV : Shift<0b11, "ror", rotr>;
-//===----------------------------------
-// Load-exclusive pair (acquiring & normal)
-//===----------------------------------
+def : ShiftAlias<"asrv", ASRVWr, GPR32>;
+def : ShiftAlias<"asrv", ASRVXr, GPR64>;
+def : ShiftAlias<"lslv", LSLVWr, GPR32>;
+def : ShiftAlias<"lslv", LSLVXr, GPR64>;
+def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
+def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
+def : ShiftAlias<"rorv", RORVWr, GPR32>;
+def : ShiftAlias<"rorv", RORVXr, GPR64>;
-class A64I_LPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
- dag ins, list<dag> pat,
- InstrItinClass itin> :
- A64I_LDSTex_tt2n <size,
- opcode{2}, 1, opcode{1}, opcode{0},
- outs, ins,
- !strconcat(asm, "\t$Rt, $Rt2, [$Rn]"),
- pat, itin>{
- let mayLoad = 1;
- let DecoderMethod = "DecodeLoadPairExclusiveInstruction";
- let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+// Multiply-add
+let AddedComplexity = 7 in {
+defm MADD : MulAccum<0, "madd", add>;
+defm MSUB : MulAccum<1, "msub", sub>;
+
+def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
+ (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
+ (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+
+def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
+ (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
+ (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+} // AddedComplexity = 7
+
+let AddedComplexity = 5 in {
+def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
+def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
+def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
+def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
+ (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
+ (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
+ (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
+ (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+} // AddedComplexity = 5
+
+def : MulAccumWAlias<"mul", MADDWrrr>;
+def : MulAccumXAlias<"mul", MADDXrrr>;
+def : MulAccumWAlias<"mneg", MSUBWrrr>;
+def : MulAccumXAlias<"mneg", MSUBXrrr>;
+def : WideMulAccumAlias<"smull", SMADDLrrr>;
+def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
+def : WideMulAccumAlias<"umull", UMADDLrrr>;
+def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
+
+// Multiply-high
+def SMULHrr : MulHi<0b010, "smulh", mulhs>;
+def UMULHrr : MulHi<0b110, "umulh", mulhu>;
+
+// CRC32
+def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
+def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
+def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
+def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;
+
+def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
+def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
+def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
+def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
+
+
+//===----------------------------------------------------------------------===//
+// Logical instructions.
+//===----------------------------------------------------------------------===//
+
+// (immediate)
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>;
+defm AND : LogicalImm<0b00, "and", and>;
+defm EOR : LogicalImm<0b10, "eor", xor>;
+defm ORR : LogicalImm<0b01, "orr", or>;
+
+// FIXME: these aliases *are* canonical sometimes (when movz can't be
+// used). Actually, it seems to be working right now, but putting logical_immXX
+// here is a bit dodgy on the AsmParser side too.
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
+ logical_imm32:$imm), 0>;
+def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
+ logical_imm64:$imm), 0>;
+
+
+// (register)
+defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
+defm BICS : LogicalRegS<0b11, 1, "bics",
+ BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
+defm AND : LogicalReg<0b00, 0, "and", and>;
+defm BIC : LogicalReg<0b00, 1, "bic",
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm EON : LogicalReg<0b10, 1, "eon",
+ BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
+defm EOR : LogicalReg<0b10, 0, "eor", xor>;
+defm ORN : LogicalReg<0b01, 1, "orn",
+ BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+defm ORR : LogicalReg<0b01, 0, "orr", or>;
+
+def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
+def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
+def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+
+def : InstAlias<"mvn $Wd, $Wm$sh",
+ (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+def : InstAlias<"mvn $Xd, $Xm$sh",
+ (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+ (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
+def : InstAlias<"tst $src1, $src2",
+ (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+ (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+def : InstAlias<"tst $src1, $src2",
+ (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+
+def : InstAlias<"tst $src1, $src2$sh",
+ (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+def : InstAlias<"tst $src1, $src2$sh",
+ (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+
+
+def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
+def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
+
+
+//===----------------------------------------------------------------------===//
+// One operand data processing instructions.
+//===----------------------------------------------------------------------===//
+
+defm CLS : OneOperandData<0b101, "cls">;
+defm CLZ : OneOperandData<0b100, "clz", ctlz>;
+defm RBIT : OneOperandData<0b000, "rbit">;
+def REV16Wr : OneWRegData<0b001, "rev16",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
+
+def : Pat<(cttz GPR32:$Rn),
+ (CLZWr (RBITWr GPR32:$Rn))>;
+def : Pat<(cttz GPR64:$Rn),
+ (CLZXr (RBITXr GPR64:$Rn))>;
+def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
+ (i32 1))),
+ (CLSWr GPR32:$Rn)>;
+def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
+ (i64 1))),
+ (CLSXr GPR64:$Rn)>;
+
+// Unlike the other one operand instructions, the instructions with the "rev"
+// mnemonic do *not* just different in the size bit, but actually use different
+// opcode bits for the different sizes.
+def REVWr : OneWRegData<0b010, "rev", bswap>;
+def REVXr : OneXRegData<0b011, "rev", bswap>;
+def REV32Xr : OneXRegData<0b010, "rev32",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+
+// The bswap commutes with the rotr so we want a pattern for both possible
+// orders.
+def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
+def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield immediate extraction instruction.
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in
+defm EXTR : ExtractImm<"extr">;
+def : InstAlias<"ror $dst, $src, $shift",
+ (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
+def : InstAlias<"ror $dst, $src, $shift",
+ (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
+
+def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
+ (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
+def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
+ (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Other bitfield immediate instructions.
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in {
+defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
+defm SBFM : BitfieldImm<0b00, "sbfm">;
+defm UBFM : BitfieldImm<0b10, "ubfm">;
}
-multiclass A64I_LPex<string asmstr, bits<3> opcode> {
- def _word: A64I_LPexs_impl<0b10, opcode, asmstr,
- (outs GPR32:$Rt, GPR32:$Rt2),
- (ins GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]>;
-
- def _dword: A64I_LPexs_impl<0b11, opcode, asmstr,
- (outs GPR64:$Rt, GPR64:$Rt2),
- (ins GPR64xsp0:$Rn),
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]>;
-}
-
-defm LDXP : A64I_LPex<"ldxp", 0b010>;
-defm LDAXP : A64I_LPex<"ldaxp", 0b011>;
-
-//===----------------------------------------------------------------------===//
-// Load-store register (unscaled immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDURB, LDURH, LDRUSB, LDRUSH, LDRUSW, STUR, STURB, STURH and PRFUM
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (register offset) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (unsigned immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (immediate post-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (immediate pre-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
-
-// Note that patterns are much later on in a completely separate section (they
-// need ADRPxi to be defined).
-
-//===-------------------------------
-// 1. Various operands needed
-//===-------------------------------
-
-//===-------------------------------
-// 1.1 Unsigned 12-bit immediate operands
-//===-------------------------------
-// The addressing mode for these instructions consists of an unsigned 12-bit
-// immediate which is scaled by the size of the memory access.
-//
-// We represent this in the MC layer by two operands:
-// 1. A base register.
-// 2. A 12-bit immediate: not multiplied by access size, so "LDR x0,[x0,#8]"
-// would have '1' in this field.
-// This means that separate functions are needed for converting representations
-// which *are* aware of the intended access size.
-
-// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
-// know the access size via some means. An isolated operand does not have this
-// information unless told from here, which means we need separate tablegen
-// Operands for each access size. This multiclass takes care of instantiating
-// the correct template functions in the rest of the backend.
-
-//===-------------------------------
-// 1.1 Unsigned 12-bit immediate operands
-//===-------------------------------
-
-multiclass offsets_uimm12<int MemSize, string prefix> {
- def uimm12_asmoperand : AsmOperandClass {
- let Name = "OffsetUImm12_" # MemSize;
- let PredicateMethod = "isOffsetUImm12<" # MemSize # ">";
- let RenderMethod = "addOffsetUImm12Operands<" # MemSize # ">";
- let DiagnosticType = "LoadStoreUImm12_" # MemSize;
- }
-
- // Pattern is really no more than an ImmLeaf, but predicated on MemSize which
- // complicates things beyond TableGen's ken.
- def uimm12 : Operand<i64>,
- ComplexPattern<i64, 1, "SelectOffsetUImm12<" # MemSize # ">"> {
- let ParserMatchClass
- = !cast<AsmOperandClass>(prefix # uimm12_asmoperand);
-
- let PrintMethod = "printOffsetUImm12Operand<" # MemSize # ">";
- let EncoderMethod = "getOffsetUImm12OpValue<" # MemSize # ">";
- }
-}
-
-defm byte_ : offsets_uimm12<1, "byte_">;
-defm hword_ : offsets_uimm12<2, "hword_">;
-defm word_ : offsets_uimm12<4, "word_">;
-defm dword_ : offsets_uimm12<8, "dword_">;
-defm qword_ : offsets_uimm12<16, "qword_">;
-
-//===-------------------------------
-// 1.1 Signed 9-bit immediate operands
-//===-------------------------------
-
-// The MCInst is expected to store the bit-wise encoding of the value,
-// which amounts to lopping off the extended sign bits.
-def SDXF_simm9 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 0x1ff, MVT::i32);
+def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;
-def simm9_asmoperand : AsmOperandClass {
- let Name = "SImm9";
- let PredicateMethod = "isSImm<9>";
- let RenderMethod = "addSImmOperands<9>";
- let DiagnosticType = "LoadStoreSImm9";
-}
-
-def simm9 : Operand<i64>,
- ImmLeaf<i64, [{ return Imm >= -0x100 && Imm <= 0xff; }],
- SDXF_simm9> {
- let PrintMethod = "printOffsetSImm9Operand";
- let ParserMatchClass = simm9_asmoperand;
-}
-
-
-//===-------------------------------
-// 1.3 Register offset extensions
-//===-------------------------------
-
-// The assembly-syntax for these addressing-modes is:
-// [<Xn|SP>, <R><m> {, <extend> {<amount>}}]
-//
-// The essential semantics are:
-// + <amount> is a shift: #<log(transfer size)> or #0
-// + <R> can be W or X.
-// + If <R> is W, <extend> can be UXTW or SXTW
-// + If <R> is X, <extend> can be LSL or SXTX
-//
-// The trickiest of those constraints is that Rm can be either GPR32 or GPR64,
-// which will need separate instructions for LLVM type-consistency. We'll also
-// need separate operands, of course.
-multiclass regexts<int MemSize, int RmSize, RegisterClass GPR,
- string Rm, string prefix> {
- def regext_asmoperand : AsmOperandClass {
- let Name = "AddrRegExtend_" # MemSize # "_" # Rm;
- let PredicateMethod = "isAddrRegExtend<" # MemSize # "," # RmSize # ">";
- let RenderMethod = "addAddrRegExtendOperands<" # MemSize # ">";
- let DiagnosticType = "LoadStoreExtend" # RmSize # "_" # MemSize;
- }
-
- def regext : Operand<i64> {
- let PrintMethod
- = "printAddrRegExtendOperand<" # MemSize # ", " # RmSize # ">";
-
- let DecoderMethod = "DecodeAddrRegExtendOperand";
- let ParserMatchClass
- = !cast<AsmOperandClass>(prefix # regext_asmoperand);
- }
-}
-
-multiclass regexts_wx<int MemSize, string prefix> {
- // Rm is an X-register if LSL or SXTX are specified as the shift.
- defm Xm_ : regexts<MemSize, 64, GPR64, "Xm", prefix # "Xm_">;
-
- // Rm is a W-register if UXTW or SXTW are specified as the shift.
- defm Wm_ : regexts<MemSize, 32, GPR32, "Wm", prefix # "Wm_">;
-}
-
-defm byte_ : regexts_wx<1, "byte_">;
-defm hword_ : regexts_wx<2, "hword_">;
-defm word_ : regexts_wx<4, "word_">;
-defm dword_ : regexts_wx<8, "dword_">;
-defm qword_ : regexts_wx<16, "qword_">;
-
-
-//===------------------------------
-// 2. The instructions themselves.
-//===------------------------------
-
-// We have the following instructions to implement:
-// | | B | H | W | X |
-// |-----------------+-------+-------+-------+--------|
-// | unsigned str | STRB | STRH | STR | STR |
-// | unsigned ldr | LDRB | LDRH | LDR | LDR |
-// | signed ldr to W | LDRSB | LDRSH | - | - |
-// | signed ldr to X | LDRSB | LDRSH | LDRSW | (PRFM) |
-
-// This will instantiate the LDR/STR instructions you'd expect to use for an
-// unsigned datatype (first two rows above) or floating-point register, which is
-// reasonably uniform across all access sizes.
-
-
-//===------------------------------
-// 2.1 Regular instructions
-//===------------------------------
-
-// This class covers the basic unsigned or irrelevantly-signed loads and stores,
-// to general-purpose and floating-point registers.
-
-class AddrParams<string prefix> {
- Operand uimm12 = !cast<Operand>(prefix # "_uimm12");
-
- Operand regextWm = !cast<Operand>(prefix # "_Wm_regext");
- Operand regextXm = !cast<Operand>(prefix # "_Xm_regext");
-}
-
-def byte_addrparams : AddrParams<"byte">;
-def hword_addrparams : AddrParams<"hword">;
-def word_addrparams : AddrParams<"word">;
-def dword_addrparams : AddrParams<"dword">;
-def qword_addrparams : AddrParams<"qword">;
-
-multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
- bit high_opc, string asmsuffix,
- RegisterClass GPR, AddrParams params> {
- // Unsigned immediate
- def _STR : A64I_LSunsigimm<size, v, {high_opc, 0b0},
- (outs), (ins GPR:$Rt, GPR64xsp:$Rn, params.uimm12:$UImm12),
- "str" # asmsuffix # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]> {
- let mayStore = 1;
- }
- def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
- def _LDR : A64I_LSunsigimm<size, v, {high_opc, 0b1},
- (outs GPR:$Rt), (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
- "ldr" # asmsuffix # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
- }
- def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
- // Register offset (four of these: load/store and Wm/Xm).
- let mayLoad = 1 in {
- def _Wm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b0,
- (outs GPR:$Rt),
- (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
- "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
-
- def _Xm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b1,
- (outs GPR:$Rt),
- (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
- "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
- }
- def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]",
- (!cast<Instruction>(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn,
- GPR64:$Rm, 2)>;
-
- let mayStore = 1 in {
- def _Wm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b0,
- (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR32:$Rm,
- params.regextWm:$Ext),
- "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
-
- def _Xm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b1,
- (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR64:$Rm,
- params.regextXm:$Ext),
- "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
- }
- def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]",
- (!cast<Instruction>(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn,
- GPR64:$Rm, 2)>;
-
- // Unaligned immediate
- def _STUR : A64I_LSunalimm<size, v, {high_opc, 0b0},
- (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
- "stur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]> {
- let mayStore = 1;
- }
- def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "_STUR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
- def _LDUR : A64I_LSunalimm<size, v, {high_opc, 0b1},
- (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
- }
- def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "_LDUR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
- // Post-indexed
- def _PostInd_STR : A64I_LSpostind<size, v, {high_opc, 0b0},
- (outs GPR64xsp:$Rn_wb),
- (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
- "str" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]> {
- let Constraints = "$Rn = $Rn_wb";
- let mayStore = 1;
-
- // Decoder only needed for unpredictability checking (FIXME).
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
- def _PostInd_LDR : A64I_LSpostind<size, v, {high_opc, 0b1},
- (outs GPR:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldr" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
- // Pre-indexed
- def _PreInd_STR : A64I_LSpreind<size, v, {high_opc, 0b0},
- (outs GPR64xsp:$Rn_wb),
- (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
- "str" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt]> {
- let Constraints = "$Rn = $Rn_wb";
- let mayStore = 1;
-
- // Decoder only needed for unpredictability checking (FIXME).
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
- def _PreInd_LDR : A64I_LSpreind<size, v, {high_opc, 0b1},
- (outs GPR:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
-}
-
-// STRB/LDRB: First define the instructions
-defm LS8
- : A64I_LDRSTR_unsigned<"LS8", 0b00, 0b0, 0b0, "b", GPR32, byte_addrparams>;
-
-// STRH/LDRH
-defm LS16
- : A64I_LDRSTR_unsigned<"LS16", 0b01, 0b0, 0b0, "h", GPR32, hword_addrparams>;
-
-
-// STR/LDR to/from a W register
-defm LS32
- : A64I_LDRSTR_unsigned<"LS32", 0b10, 0b0, 0b0, "", GPR32, word_addrparams>;
-
-// STR/LDR to/from an X register
-defm LS64
- : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>;
-
-let Predicates = [HasFPARMv8] in {
-// STR/LDR to/from a B register
-defm LSFP8
- : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>;
-
-// STR/LDR to/from an H register
-defm LSFP16
- : A64I_LDRSTR_unsigned<"LSFP16", 0b01, 0b1, 0b0, "", FPR16, hword_addrparams>;
-
-// STR/LDR to/from an S register
-defm LSFP32
- : A64I_LDRSTR_unsigned<"LSFP32", 0b10, 0b1, 0b0, "", FPR32, word_addrparams>;
-// STR/LDR to/from a D register
-defm LSFP64
- : A64I_LDRSTR_unsigned<"LSFP64", 0b11, 0b1, 0b0, "", FPR64, dword_addrparams>;
-// STR/LDR to/from a Q register
-defm LSFP128
- : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128,
- qword_addrparams>;
-}
-
-//===------------------------------
-// 2.3 Signed loads
-//===------------------------------
-
-// Byte and half-word signed loads can both go into either an X or a W register,
-// so it's worth factoring out. Signed word loads don't fit because there is no
-// W version.
-multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
- string prefix> {
- // Unsigned offset
- def w : A64I_LSunsigimm<size, 0b0, 0b11,
- (outs GPR32:$Rt),
- (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
- }
- def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # w) GPR32:$Rt, GPR64xsp:$Rn, 0)>;
-
- def x : A64I_LSunsigimm<size, 0b0, 0b10,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
- }
- def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # x) GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
- // Register offset
- let mayLoad = 1 in {
- def w_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b0,
- (outs GPR32:$Rt),
- (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
-
- def w_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b1,
- (outs GPR32:$Rt),
- (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
-
- def x_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b0,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
-
- def x_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b1,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
- }
- def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
- (!cast<Instruction>(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn,
- GPR64:$Rm, 2)>;
-
- def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
- (!cast<Instruction>(prefix # "x_Xm_RegOffset") GPR64:$Rt, GPR64xsp:$Rn,
- GPR64:$Rm, 2)>;
-
-
- let mayLoad = 1 in {
- // Unaligned offset
- def w_U : A64I_LSunalimm<size, 0b0, 0b11,
- (outs GPR32:$Rt),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]>;
-
- def x_U : A64I_LSunalimm<size, 0b0, 0b10,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]>;
-
-
- // Post-indexed
- def w_PostInd : A64I_LSpostind<size, 0b0, 0b11,
- (outs GPR32:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
- def x_PostInd : A64I_LSpostind<size, 0b0, 0b10,
- (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
- // Pre-indexed
- def w_PreInd : A64I_LSpreind<size, 0b0, 0b11,
- (outs GPR32:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
-
- def x_PreInd : A64I_LSpreind<size, 0b0, 0b10,
- (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
- }
- } // let mayLoad = 1
-}
-
-// LDRSB
-defm LDRSB : A64I_LDR_signed<0b00, "b", byte_addrparams, "LDRSB">;
-// LDRSH
-defm LDRSH : A64I_LDR_signed<0b01, "h", hword_addrparams, "LDRSH">;
-
-// LDRSW: load a 32-bit register, sign-extending to 64-bits.
-def LDRSWx
- : A64I_LSunsigimm<0b10, 0b0, 0b10,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, word_uimm12:$UImm12),
- "ldrsw\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
-}
-def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-let mayLoad = 1 in {
- def LDRSWx_Wm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b0,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext),
- "ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
-
- def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext),
- "ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd, ReadLd]>;
-}
-def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]",
- (LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>;
-
-
-def LDURSWx
- : A64I_LSunalimm<0b10, 0b0, 0b10,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldursw\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
-}
-def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-def LDRSWx_PostInd
- : A64I_LSpostind<0b10, 0b0, 0b10,
- (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldrsw\t$Rt, [$Rn], $SImm9",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
-}
-
-def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10,
- (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldrsw\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeSingleIndexedInstruction";
-}
-
-//===------------------------------
-// 2.4 Prefetch operations
-//===------------------------------
-
-def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs),
- (ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12),
- "prfm\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary>,
- Sched<[WritePreLd, ReadPreLd]> {
- let mayLoad = 1;
-}
-def : InstAlias<"prfm $Rt, [$Rn]",
- (PRFM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>;
-
-let mayLoad = 1 in {
- def PRFM_Wm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b0, (outs),
- (ins prefetch_op:$Rt, GPR64xsp:$Rn,
- GPR32:$Rm, dword_Wm_regext:$Ext),
- "prfm\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WritePreLd, ReadPreLd]>;
- def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs),
- (ins prefetch_op:$Rt, GPR64xsp:$Rn,
- GPR64:$Rm, dword_Xm_regext:$Ext),
- "prfm\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>,
- Sched<[WritePreLd, ReadPreLd]>;
-}
-
-def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
- (PRFM_Xm_RegOffset prefetch_op:$Rt, GPR64xsp:$Rn,
- GPR64:$Rm, 2)>;
-
-
-def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs),
- (ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
- "prfum\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WritePreLd, ReadPreLd]> {
- let mayLoad = 1;
-}
-def : InstAlias<"prfum $Rt, [$Rn]",
- (PRFUM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>;
-
-//===----------------------------------------------------------------------===//
-// Load-store register (unprivileged) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDTRB, LDTRH, LDTRSB, LDTRSH, LDTRSW, STTR, STTRB and STTRH
-
-// These instructions very much mirror the "unscaled immediate" loads, but since
-// there are no floating-point variants we need to split them out into their own
-// section to avoid instantiation of "ldtr d0, [sp]" etc.
-
-multiclass A64I_LDTRSTTR<bits<2> size, string asmsuffix, RegisterClass GPR,
- string prefix> {
- def _UnPriv_STR : A64I_LSunpriv<size, 0b0, 0b00,
- (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
- "sttr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayStore = 1;
- }
-
- def : InstAlias<"sttr" # asmsuffix # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "_UnPriv_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
- def _UnPriv_LDR : A64I_LSunpriv<size, 0b0, 0b01,
- (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldtr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
- }
-
- def : InstAlias<"ldtr" # asmsuffix # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "_UnPriv_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-}
-
-// STTRB/LDTRB: First define the instructions
-defm LS8 : A64I_LDTRSTTR<0b00, "b", GPR32, "LS8">;
-
-// STTRH/LDTRH
-defm LS16 : A64I_LDTRSTTR<0b01, "h", GPR32, "LS16">;
-
-// STTR/LDTR to/from a W register
-defm LS32 : A64I_LDTRSTTR<0b10, "", GPR32, "LS32">;
-
-// STTR/LDTR to/from an X register
-defm LS64 : A64I_LDTRSTTR<0b11, "", GPR64, "LS64">;
-
-// Now a class for the signed instructions that can go to either 32 or 64
-// bits...
-multiclass A64I_LDTR_signed<bits<2> size, string asmopcode, string prefix> {
- let mayLoad = 1 in {
- def w : A64I_LSunpriv<size, 0b0, 0b11,
- (outs GPR32:$Rt),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]>;
-
- def x : A64I_LSunpriv<size, 0b0, 0b10,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]>;
- }
-
- def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "w") GPR32:$Rt, GPR64xsp:$Rn, 0)>;
-
- def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
- (!cast<Instruction>(prefix # "x") GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-}
-
-// LDTRSB
-defm LDTRSB : A64I_LDTR_signed<0b00, "b", "LDTRSB">;
-// LDTRSH
-defm LDTRSH : A64I_LDTR_signed<0b01, "h", "LDTRSH">;
-
-// And finally LDTRSW which only goes to 64 bits.
-def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10,
- (outs GPR64:$Rt),
- (ins GPR64xsp:$Rn, simm9:$SImm9),
- "ldtrsw\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayLoad = 1;
-}
-def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-//===----------------------------------------------------------------------===//
-// Load-store register pair (offset) instructions
-//===----------------------------------------------------------------------===//
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register pair (post-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STP, LDP, LDPSW
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register pair (pre-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STP, LDP, LDPSW
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store non-temporal register pair (offset) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STNP, LDNP
-
-
-// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
-// know the access size via some means. An isolated operand does not have this
-// information unless told from here, which means we need separate tablegen
-// Operands for each access size. This multiclass takes care of instantiating
-// the correct template functions in the rest of the backend.
-
-multiclass offsets_simm7<string MemSize, string prefix> {
- // The bare signed 7-bit immediate is used in post-indexed instructions, but
- // because of the scaling performed a generic "simm7" operand isn't
- // appropriate here either.
- def simm7_asmoperand : AsmOperandClass {
- let Name = "SImm7_Scaled" # MemSize;
- let PredicateMethod = "isSImm7Scaled<" # MemSize # ">";
- let RenderMethod = "addSImm7ScaledOperands<" # MemSize # ">";
- let DiagnosticType = "LoadStoreSImm7_" # MemSize;
- }
-
- def simm7 : Operand<i64> {
- let PrintMethod = "printSImm7ScaledOperand<" # MemSize # ">";
- let ParserMatchClass = !cast<AsmOperandClass>(prefix # "simm7_asmoperand");
- }
-}
-
-defm word_ : offsets_simm7<"4", "word_">;
-defm dword_ : offsets_simm7<"8", "dword_">;
-defm qword_ : offsets_simm7<"16", "qword_">;
-
-multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
- Operand simm7, string prefix> {
- def _STR : A64I_LSPoffset<opc, v, 0b0, (outs),
- (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
- "stp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
- Sched<[WriteLd, ReadLd]> {
- let mayStore = 1;
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
- def : InstAlias<"stp $Rt, $Rt2, [$Rn]",
- (!cast<Instruction>(prefix # "_STR") SomeReg:$Rt,
- SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
- def _LDR : A64I_LSPoffset<opc, v, 0b1,
- (outs SomeReg:$Rt, SomeReg:$Rt2),
- (ins GPR64xsp:$Rn, simm7:$SImm7),
- "ldp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
- def : InstAlias<"ldp $Rt, $Rt2, [$Rn]",
- (!cast<Instruction>(prefix # "_LDR") SomeReg:$Rt,
- SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
- def _PostInd_STR : A64I_LSPpostind<opc, v, 0b0,
- (outs GPR64xsp:$Rn_wb),
- (ins SomeReg:$Rt, SomeReg:$Rt2,
- GPR64xsp:$Rn,
- simm7:$SImm7),
- "stp\t$Rt, $Rt2, [$Rn], $SImm7",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
- let mayStore = 1;
- let Constraints = "$Rn = $Rn_wb";
-
- // Decoder only needed for unpredictability checking (FIXME).
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
-
- def _PostInd_LDR : A64I_LSPpostind<opc, v, 0b1,
- (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm7:$SImm7),
- "ldp\t$Rt, $Rt2, [$Rn], $SImm7",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
-
- def _PreInd_STR : A64I_LSPpreind<opc, v, 0b0, (outs GPR64xsp:$Rn_wb),
- (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
- "stp\t$Rt, $Rt2, [$Rn, $SImm7]!",
- [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
- let mayStore = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
-
- def _PreInd_LDR : A64I_LSPpreind<opc, v, 0b1,
- (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
- (ins GPR64xsp:$Rn, simm7:$SImm7),
- "ldp\t$Rt, $Rt2, [$Rn, $SImm7]!",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
-
- def _NonTemp_STR : A64I_LSPnontemp<opc, v, 0b0, (outs),
- (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
- "stnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
- Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
- let mayStore = 1;
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
- def : InstAlias<"stnp $Rt, $Rt2, [$Rn]",
- (!cast<Instruction>(prefix # "_NonTemp_STR") SomeReg:$Rt,
- SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
- def _NonTemp_LDR : A64I_LSPnontemp<opc, v, 0b1,
- (outs SomeReg:$Rt, SomeReg:$Rt2),
- (ins GPR64xsp:$Rn, simm7:$SImm7),
- "ldnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let DecoderMethod = "DecodeLDSTPairInstruction";
- }
- def : InstAlias<"ldnp $Rt, $Rt2, [$Rn]",
- (!cast<Instruction>(prefix # "_NonTemp_LDR") SomeReg:$Rt,
- SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
-}
-
-
-defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">;
-defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">;
-
-let Predicates = [HasFPARMv8] in {
-defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">;
-defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">;
-defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
- "LSFPPair128">;
-}
-
-
-def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
- (outs GPR64:$Rt, GPR64:$Rt2),
- (ins GPR64xsp:$Rn, word_simm7:$SImm7),
- "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
- Sched<[WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let DecoderMethod = "DecodeLDSTPairInstruction";
-}
-def : InstAlias<"ldpsw $Rt, $Rt2, [$Rn]",
- (LDPSWx GPR64:$Rt, GPR64:$Rt2, GPR64xsp:$Rn, 0)>;
-
-def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1,
- (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
- (ins GPR64xsp:$Rn, word_simm7:$SImm7),
- "ldpsw\t$Rt, $Rt2, [$Rn], $SImm7",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeLDSTPairInstruction";
-}
-
-def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1,
- (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
- (ins GPR64xsp:$Rn, word_simm7:$SImm7),
- "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!",
- [], NoItinerary>,
- Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
- let mayLoad = 1;
- let Constraints = "$Rn = $Rn_wb";
- let DecoderMethod = "DecodeLDSTPairInstruction";
-}
-
-//===----------------------------------------------------------------------===//
-// Logical (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: AND, ORR, EOR, ANDS, + aliases TST, MOV
-
-multiclass logical_imm_operands<string prefix, string note,
- int size, ValueType VT> {
- def _asmoperand : AsmOperandClass {
- let Name = "LogicalImm" # note # size;
- let PredicateMethod = "isLogicalImm" # note # "<" # size # ">";
- let RenderMethod = "addLogicalImmOperands<" # size # ">";
- let DiagnosticType = "LogicalSecondSource";
- }
-
- def _operand
- : Operand<VT>, ComplexPattern<VT, 1, "SelectLogicalImm", [imm]> {
- let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
- let PrintMethod = "printLogicalImmOperand<" # size # ">";
- let DecoderMethod = "DecodeLogicalImmOperand<" # size # ">";
- }
-}
-
-defm logical_imm32 : logical_imm_operands<"logical_imm32", "", 32, i32>;
-defm logical_imm64 : logical_imm_operands<"logical_imm64", "", 64, i64>;
-
-// The mov versions only differ in assembly parsing, where they
-// exclude values representable with either MOVZ or MOVN.
-defm logical_imm32_mov
- : logical_imm_operands<"logical_imm32_mov", "MOV", 32, i32>;
-defm logical_imm64_mov
- : logical_imm_operands<"logical_imm64_mov", "MOV", 64, i64>;
-
-
-multiclass A64I_logimmSizes<bits<2> opc, string asmop, SDNode opnode> {
- def wwi : A64I_logicalimm<0b0, opc, (outs GPR32wsp:$Rd),
- (ins GPR32:$Rn, logical_imm32_operand:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [(set i32:$Rd,
- (opnode i32:$Rn, logical_imm32_operand:$Imm))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-
- def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd),
- (ins GPR64:$Rn, logical_imm64_operand:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [(set i64:$Rd,
- (opnode i64:$Rn, logical_imm64_operand:$Imm))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-}
-
-defm AND : A64I_logimmSizes<0b00, "and", and>;
-defm ORR : A64I_logimmSizes<0b01, "orr", or>;
-defm EOR : A64I_logimmSizes<0b10, "eor", xor>;
-
-let Defs = [NZCV] in {
- def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd),
- (ins GPR32:$Rn, logical_imm32_operand:$Imm),
- "ands\t$Rd, $Rn, $Imm",
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-
- def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd),
- (ins GPR64:$Rn, logical_imm64_operand:$Imm),
- "ands\t$Rd, $Rn, $Imm",
- [], NoItinerary>,
- Sched<[WriteALU, ReadALU]>;
-}
-
-
-def : InstAlias<"tst $Rn, $Imm",
- (ANDSwwi WZR, GPR32:$Rn, logical_imm32_operand:$Imm)>;
-def : InstAlias<"tst $Rn, $Imm",
- (ANDSxxi XZR, GPR64:$Rn, logical_imm64_operand:$Imm)>;
-def : InstAlias<"mov $Rd, $Imm",
- (ORRwwi GPR32wsp:$Rd, WZR, logical_imm32_mov_operand:$Imm)>;
-def : InstAlias<"mov $Rd, $Imm",
- (ORRxxi GPR64xsp:$Rd, XZR, logical_imm64_mov_operand:$Imm)>;
-
-//===----------------------------------------------------------------------===//
-// Logical (shifted register) instructions
-//===----------------------------------------------------------------------===//
-// Contains: AND, BIC, ORR, ORN, EOR, EON, ANDS, BICS + aliases TST, MVN, MOV
-
-// Operand for optimizing (icmp (and LHS, RHS), 0, SomeCode). In theory "ANDS"
-// behaves differently for unsigned comparisons, so we defensively only allow
-// signed or n/a as the operand. In practice "unsigned greater than 0" is "not
-// equal to 0" and LLVM gives us this.
-def signed_cond : PatLeaf<(cond), [{
- return !isUnsignedIntSetCC(N->get());
+def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;
-
-// These instructions share their "shift" operands with add/sub (shifted
-// register instructions). They are defined there.
-
-// N.b. the commutable parameter is just !N. It will be first against the wall
-// when the revolution comes.
-multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
- bit N, bit commutable,
- string asmop, SDPatternOperator opfrag, ValueType ty,
- RegisterClass GPR, list<Register> defs> {
- let isCommutable = commutable, Defs = defs in {
- def _lsl : A64I_logicalshift<sf, opc, 0b00, N,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set ty:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _lsr : A64I_logicalshift<sf, opc, 0b01, N,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _asr : A64I_logicalshift<sf, opc, 0b10, N,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _ror : A64I_logicalshift<sf, opc, 0b11, N,
- (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("ror_operand_" # ty):$Imm6),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
- [(set ty:$Rd, (opfrag ty:$Rn, (rotr ty:$Rm,
- !cast<Operand>("ror_operand_" # ty):$Imm6))
- )],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- }
-
- def _noshift
- : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
- (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
- GPR:$Rm, 0)>;
-
- def : Pat<(opfrag ty:$Rn, ty:$Rm),
- (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-multiclass logical_sizes<string prefix, bits<2> opc, bit N, bit commutable,
- string asmop, SDPatternOperator opfrag,
- list<Register> defs> {
- defm xxx : logical_shifts<prefix # "xxx", 0b1, opc, N,
- commutable, asmop, opfrag, i64, GPR64, defs>;
- defm www : logical_shifts<prefix # "www", 0b0, opc, N,
- commutable, asmop, opfrag, i32, GPR32, defs>;
-}
-
-
-defm AND : logical_sizes<"AND", 0b00, 0b0, 0b1, "and", and, []>;
-defm ORR : logical_sizes<"ORR", 0b01, 0b0, 0b1, "orr", or, []>;
-defm EOR : logical_sizes<"EOR", 0b10, 0b0, 0b1, "eor", xor, []>;
-defm ANDS : logical_sizes<"ANDS", 0b11, 0b0, 0b1, "ands",
- PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs),
- [{ (void)N; return false; }]>,
- [NZCV]>;
-
-defm BIC : logical_sizes<"BIC", 0b00, 0b1, 0b0, "bic",
- PatFrag<(ops node:$lhs, node:$rhs),
- (and node:$lhs, (not node:$rhs))>, []>;
-defm ORN : logical_sizes<"ORN", 0b01, 0b1, 0b0, "orn",
- PatFrag<(ops node:$lhs, node:$rhs),
- (or node:$lhs, (not node:$rhs))>, []>;
-defm EON : logical_sizes<"EON", 0b10, 0b1, 0b0, "eon",
- PatFrag<(ops node:$lhs, node:$rhs),
- (xor node:$lhs, (not node:$rhs))>, []>;
-defm BICS : logical_sizes<"BICS", 0b11, 0b1, 0b0, "bics",
- PatFrag<(ops node:$lhs, node:$rhs),
- (and node:$lhs, (not node:$rhs)),
- [{ (void)N; return false; }]>,
- [NZCV]>;
-
-multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
- let isCommutable = 1, Rd = 0b11111, Defs = [NZCV] in {
- def _lsl : A64I_logicalshift<sf, 0b11, 0b00, 0b0,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6),
- "tst\t$Rn, $Rm, $Imm6",
- [(set NZCV, (A64setcc (and ty:$Rn, (shl ty:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6)),
- 0, signed_cond))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
-
- def _lsr : A64I_logicalshift<sf, 0b11, 0b01, 0b0,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6),
- "tst\t$Rn, $Rm, $Imm6",
- [(set NZCV, (A64setcc (and ty:$Rn, (srl ty:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6)),
- 0, signed_cond))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _asr : A64I_logicalshift<sf, 0b11, 0b10, 0b0,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6),
- "tst\t$Rn, $Rm, $Imm6",
- [(set NZCV, (A64setcc (and ty:$Rn, (sra ty:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6)),
- 0, signed_cond))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _ror : A64I_logicalshift<sf, 0b11, 0b11, 0b0,
- (outs),
- (ins GPR:$Rn, GPR:$Rm,
- !cast<Operand>("ror_operand_" # ty):$Imm6),
- "tst\t$Rn, $Rm, $Imm6",
- [(set NZCV, (A64setcc (and ty:$Rn, (rotr ty:$Rm,
- !cast<Operand>("ror_operand_" # ty):$Imm6)),
- 0, signed_cond))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- }
-
- def _noshift : InstAlias<"tst $Rn, $Rm",
- (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
-
- def : Pat<(A64setcc (and ty:$Rn, ty:$Rm), 0, signed_cond),
- (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-defm TSTxx : tst_shifts<"TSTxx", 0b1, i64, GPR64>;
-defm TSTww : tst_shifts<"TSTww", 0b0, i32, GPR32>;
-
-
-multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
- let isCommutable = 0, Rn = 0b11111 in {
- def _lsl : A64I_logicalshift<sf, 0b01, 0b00, 0b1,
- (outs GPR:$Rd),
- (ins GPR:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6),
- "mvn\t$Rd, $Rm, $Imm6",
- [(set ty:$Rd, (not (shl ty:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6)))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
-
- def _lsr : A64I_logicalshift<sf, 0b01, 0b01, 0b1,
- (outs GPR:$Rd),
- (ins GPR:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6),
- "mvn\t$Rd, $Rm, $Imm6",
- [(set ty:$Rd, (not (srl ty:$Rm,
- !cast<Operand>("lsr_operand_" # ty):$Imm6)))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _asr : A64I_logicalshift<sf, 0b01, 0b10, 0b1,
- (outs GPR:$Rd),
- (ins GPR:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6),
- "mvn\t$Rd, $Rm, $Imm6",
- [(set ty:$Rd, (not (sra ty:$Rm,
- !cast<Operand>("asr_operand_" # ty):$Imm6)))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
-
- def _ror : A64I_logicalshift<sf, 0b01, 0b11, 0b1,
- (outs GPR:$Rd),
- (ins GPR:$Rm,
- !cast<Operand>("ror_operand_" # ty):$Imm6),
- "mvn\t$Rd, $Rm, $Imm6",
- [(set ty:$Rd, (not (rotr ty:$Rm,
- !cast<Operand>("lsl_operand_" # ty):$Imm6)))],
- NoItinerary>,
- Sched<[WriteALU, ReadALU, ReadALU]>;
- }
-
- def _noshift : InstAlias<"mvn $Rn, $Rm",
- (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
-
- def : Pat<(not ty:$Rm),
- (!cast<Instruction>(prefix # "_lsl") $Rm, 0)>;
-}
-
-defm MVNxx : mvn_shifts<"MVNxx", 0b1, i64, GPR64>;
-defm MVNww : mvn_shifts<"MVNww", 0b0, i32, GPR32>;
-
-def MOVxx :InstAlias<"mov $Rd, $Rm", (ORRxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-def MOVww :InstAlias<"mov $Rd, $Rm", (ORRwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-
-//===----------------------------------------------------------------------===//
-// Move wide (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: MOVN, MOVZ, MOVK + MOV aliases
-
-// A wide variety of different relocations are needed for variants of these
-// instructions, so it turns out that we need a different operand for all of
-// them.
-multiclass movw_operands<string prefix, string instname, int width> {
- def _imm_asmoperand : AsmOperandClass {
- let Name = instname # width # "Shifted" # shift;
- let PredicateMethod = "is" # instname # width # "Imm";
- let RenderMethod = "addMoveWideImmOperands";
- let ParserMethod = "ParseImmWithLSLOperand";
- let DiagnosticType = "MOVWUImm16";
- }
-
- def _imm : Operand<i64> {
- let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_imm_asmoperand");
- let PrintMethod = "printMoveWideImmOperand";
- let EncoderMethod = "getMoveWideImmOpValue";
- let DecoderMethod = "DecodeMoveWideImmOperand<" # width # ">";
-
- let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
- }
-}
-
-defm movn32 : movw_operands<"movn32", "MOVN", 32>;
-defm movn64 : movw_operands<"movn64", "MOVN", 64>;
-defm movz32 : movw_operands<"movz32", "MOVZ", 32>;
-defm movz64 : movw_operands<"movz64", "MOVZ", 64>;
-defm movk32 : movw_operands<"movk32", "MOVK", 32>;
-defm movk64 : movw_operands<"movk64", "MOVK", 64>;
-
-multiclass A64I_movwSizes<bits<2> opc, string asmop, dag ins32bit,
- dag ins64bit> {
-
- def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit,
- !strconcat(asmop, "\t$Rd, $FullImm"),
- [], NoItinerary>,
- Sched<[WriteALU]> {
- bits<18> FullImm;
- let UImm16 = FullImm{15-0};
- let Shift = FullImm{17-16};
- }
-
- def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit,
- !strconcat(asmop, "\t$Rd, $FullImm"),
- [], NoItinerary>,
- Sched<[WriteALU]> {
- bits<18> FullImm;
- let UImm16 = FullImm{15-0};
- let Shift = FullImm{17-16};
- }
-}
-
-let isMoveImm = 1, isReMaterializable = 1,
- isAsCheapAsAMove = 1, hasSideEffects = 0 in {
- defm MOVN : A64I_movwSizes<0b00, "movn",
- (ins movn32_imm:$FullImm),
- (ins movn64_imm:$FullImm)>;
-
- // Some relocations are able to convert between a MOVZ and a MOVN. If these
- // are applied the instruction must be emitted with the corresponding bits as
- // 0, which means a MOVZ needs to override that bit from the default.
- let PostEncoderMethod = "fixMOVZ" in
- defm MOVZ : A64I_movwSizes<0b10, "movz",
- (ins movz32_imm:$FullImm),
- (ins movz64_imm:$FullImm)>;
-}
-
-let Constraints = "$src = $Rd",
- SchedRW = [WriteALU, ReadALU] in
-defm MOVK : A64I_movwSizes<0b11, "movk",
- (ins GPR32:$src, movk32_imm:$FullImm),
- (ins GPR64:$src, movk64_imm:$FullImm)>;
-
-
-// And now the "MOV" aliases. These also need their own operands because what
-// they accept is completely different to what the base instructions accept.
-multiclass movalias_operand<string prefix, string basename,
- string immpredicate, int width> {
- def _asmoperand : AsmOperandClass {
- let Name = basename # width # "MovAlias";
- let PredicateMethod
- = "isMoveWideMovAlias<" # width # ", A64Imms::" # immpredicate # ">";
- let RenderMethod
- = "addMoveWideMovAliasOperands<" # width # ", "
- # "A64Imms::" # immpredicate # ">";
- }
-
- def _movimm : Operand<i64> {
- let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
-
- let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
- }
-}
-
-defm movz32 : movalias_operand<"movz32", "MOVZ", "isMOVZImm", 32>;
-defm movz64 : movalias_operand<"movz64", "MOVZ", "isMOVZImm", 64>;
-defm movn32 : movalias_operand<"movn32", "MOVN", "isOnlyMOVNImm", 32>;
-defm movn64 : movalias_operand<"movn64", "MOVN", "isOnlyMOVNImm", 64>;
-
-// FIXME: these are officially canonical aliases, but TableGen is too limited to
-// print them at the moment. I believe in this case an "AliasPredicate" method
-// will need to be implemented. to allow it, as well as the more generally
-// useful handling of non-register, non-constant operands.
-class movalias<Instruction INST, RegisterClass GPR, Operand operand>
- : InstAlias<"mov $Rd, $FullImm", (INST GPR:$Rd, operand:$FullImm)>;
-
-def : movalias<MOVZwii, GPR32, movz32_movimm>;
-def : movalias<MOVZxii, GPR64, movz64_movimm>;
-def : movalias<MOVNwii, GPR32, movn32_movimm>;
-def : movalias<MOVNxii, GPR64, movn64_movimm>;
-
-def movw_addressref_g0 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<0>">;
-def movw_addressref_g1 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<1>">;
-def movw_addressref_g2 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<2>">;
-def movw_addressref_g3 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<3>">;
-
-def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2,
- movw_addressref_g1:$G1, movw_addressref_g0:$G0),
- (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref_g3:$G3),
- movw_addressref_g2:$G2),
- movw_addressref_g1:$G1),
- movw_addressref_g0:$G0)>;
-
-//===----------------------------------------------------------------------===//
-// PC-relative addressing instructions
-//===----------------------------------------------------------------------===//
-// Contains: ADR, ADRP
-
-def adr_label : Operand<i64> {
- let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_adr_prel>";
-
- // This label is a 21-bit offset from PC, unscaled
- let PrintMethod = "printLabelOperand<21, 1>";
- let ParserMatchClass = label_asmoperand<21, 1>;
- let OperandType = "OPERAND_PCREL";
-}
-
-def adrp_label_asmoperand : AsmOperandClass {
- let Name = "AdrpLabel";
- let RenderMethod = "addLabelOperands<21, 4096>";
- let DiagnosticType = "Label";
-}
-
-def adrp_label : Operand<i64> {
- let EncoderMethod = "getAdrpLabelOpValue";
-
- // This label is a 21-bit offset from PC, scaled by the page-size: 4096.
- let PrintMethod = "printLabelOperand<21, 4096>";
- let ParserMatchClass = adrp_label_asmoperand;
- let OperandType = "OPERAND_PCREL";
-}
-
-let hasSideEffects = 0 in {
- def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label),
- "adr\t$Rd, $Label", [], NoItinerary>,
- Sched<[WriteALUs]>;
-
- def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label),
- "adrp\t$Rd, $Label", [], NoItinerary>,
- Sched<[WriteALUs]>;
-}
-
-//===----------------------------------------------------------------------===//
-// System instructions
-//===----------------------------------------------------------------------===//
-// Contains: HINT, CLREX, DSB, DMB, ISB, MSR, SYS, SYSL, MRS
-// + aliases IC, DC, AT, TLBI, NOP, YIELD, WFE, WFI, SEV, SEVL
-
-// Op1 and Op2 fields are sometimes simple 3-bit unsigned immediate values.
-def uimm3_asmoperand : AsmOperandClass {
- let Name = "UImm3";
- let PredicateMethod = "isUImm<3>";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "UImm3";
-}
-
-def uimm3 : Operand<i32> {
- let ParserMatchClass = uimm3_asmoperand;
-}
-
-// The HINT alias can accept a simple unsigned 7-bit immediate.
-def uimm7_asmoperand : AsmOperandClass {
- let Name = "UImm7";
- let PredicateMethod = "isUImm<7>";
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "UImm7";
-}
-
-def uimm7 : Operand<i32> {
- let ParserMatchClass = uimm7_asmoperand;
-}
-
-// Multiclass namedimm is defined with the prefetch operands. Most of these fit
-// into the NamedImmMapper scheme well: they either accept a named operand or
-// any immediate under a particular value (which may be 0, implying no immediate
-// is allowed).
-defm dbarrier : namedimm<"dbarrier", "A64DB::DBarrierMapper">;
-defm isb : namedimm<"isb", "A64ISB::ISBMapper">;
-defm ic : namedimm<"ic", "A64IC::ICMapper">;
-defm dc : namedimm<"dc", "A64DC::DCMapper">;
-defm at : namedimm<"at", "A64AT::ATMapper">;
-defm tlbi : namedimm<"tlbi", "A64TLBI::TLBIMapper">;
-
-// However, MRS and MSR are more complicated for a few reasons:
-// * There are ~1000 generic names S3_<op1>_<CRn>_<CRm>_<Op2> which have an
-// implementation-defined effect
-// * Most registers are shared, but some are read-only or write-only.
-// * There is a variant of MSR which accepts the same register name (SPSel),
-// but which would have a different encoding.
-
-// In principle these could be resolved in with more complicated subclasses of
-// NamedImmMapper, however that imposes an overhead on other "named
-// immediates". Both in concrete terms with virtual tables and in unnecessary
-// abstraction.
-
-// The solution adopted here is to take the MRS/MSR Mappers out of the usual
-// hierarchy (they're not derived from NamedImmMapper) and to add logic for
-// their special situation.
-def mrs_asmoperand : AsmOperandClass {
- let Name = "MRS";
- let ParserMethod = "ParseSysRegOperand";
- let DiagnosticType = "MRS";
-}
-
-def mrs_op : Operand<i32> {
- let ParserMatchClass = mrs_asmoperand;
- let PrintMethod = "printMRSOperand";
- let DecoderMethod = "DecodeMRSOperand";
-}
-
-def msr_asmoperand : AsmOperandClass {
- let Name = "MSRWithReg";
-
- // Note that SPSel is valid for both this and the pstate operands, but with
- // different immediate encodings. This is why these operands provide a string
- // AArch64Operand rather than an immediate. The overlap is small enough that
- // it could be resolved with hackery now, but who can say in future?
- let ParserMethod = "ParseSysRegOperand";
- let DiagnosticType = "MSR";
-}
-
-def msr_op : Operand<i32> {
- let ParserMatchClass = msr_asmoperand;
- let PrintMethod = "printMSROperand";
- let DecoderMethod = "DecodeMSROperand";
-}
-
-def pstate_asmoperand : AsmOperandClass {
- let Name = "MSRPState";
- // See comment above about parser.
- let ParserMethod = "ParseSysRegOperand";
- let DiagnosticType = "MSR";
-}
-
-def pstate_op : Operand<i32> {
- let ParserMatchClass = pstate_asmoperand;
- let PrintMethod = "printNamedImmOperand<A64PState::PStateMapper>";
- let DecoderMethod = "DecodeNamedImmOperand<A64PState::PStateMapper>";
-}
-
-// When <CRn> is specified, an assembler should accept something like "C4", not
-// the usual "#4" immediate.
-def CRx_asmoperand : AsmOperandClass {
- let Name = "CRx";
- let PredicateMethod = "isUImm<4>";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "ParseCRxOperand";
- // Diagnostics are handled in all cases by ParseCRxOperand.
-}
-
-def CRx : Operand<i32> {
- let ParserMatchClass = CRx_asmoperand;
- let PrintMethod = "printCRxOperand";
-}
-
-
-// Finally, we can start defining the instructions.
-
-// HINT is straightforward, with a few aliases.
-def HINTi : A64I_system<0b0, (outs), (ins uimm7:$UImm7), "hint\t$UImm7",
- [], NoItinerary> {
- bits<7> UImm7;
- let CRm = UImm7{6-3};
- let Op2 = UImm7{2-0};
-
- let Op0 = 0b00;
- let Op1 = 0b011;
- let CRn = 0b0010;
- let Rt = 0b11111;
-}
-
-def : InstAlias<"nop", (HINTi 0)>;
-def : InstAlias<"yield", (HINTi 1)>;
-def : InstAlias<"wfe", (HINTi 2)>;
-def : InstAlias<"wfi", (HINTi 3)>;
-def : InstAlias<"sev", (HINTi 4)>;
-def : InstAlias<"sevl", (HINTi 5)>;
-
-// Quite a few instructions then follow a similar pattern of fixing common
-// fields in the bitpattern, we'll define a helper-class for them.
-class simple_sys<bits<2> op0, bits<3> op1, bits<4> crn, bits<3> op2,
- Operand operand, string asmop>
- : A64I_system<0b0, (outs), (ins operand:$CRm), !strconcat(asmop, "\t$CRm"),
- [], NoItinerary> {
- let Op0 = op0;
- let Op1 = op1;
- let CRn = crn;
- let Op2 = op2;
- let Rt = 0b11111;
-}
-
-
-def CLREXi : simple_sys<0b00, 0b011, 0b0011, 0b010, uimm4, "clrex">;
-def DSBi : simple_sys<0b00, 0b011, 0b0011, 0b100, dbarrier_op, "dsb">;
-def DMBi : simple_sys<0b00, 0b011, 0b0011, 0b101, dbarrier_op, "dmb">;
-def ISBi : simple_sys<0b00, 0b011, 0b0011, 0b110, isb_op, "isb">;
-
-def : InstAlias<"clrex", (CLREXi 0b1111)>;
-def : InstAlias<"isb", (ISBi 0b1111)>;
-
-// (DMBi 0xb) is a "DMB ISH" instruciton, appropriate for Linux SMP
-// configurations at least.
-def : Pat<(atomic_fence imm, imm), (DMBi 0xb)>;
-
-// Any SYS bitpattern can be represented with a complex and opaque "SYS"
-// instruction.
-def SYSiccix : A64I_system<0b0, (outs),
- (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm,
- uimm3:$Op2, GPR64:$Rt),
- "sys\t$Op1, $CRn, $CRm, $Op2, $Rt",
- [], NoItinerary> {
- let Op0 = 0b01;
-}
-
-// You can skip the Xt argument whether it makes sense or not for the generic
-// SYS instruction.
-def : InstAlias<"sys $Op1, $CRn, $CRm, $Op2",
- (SYSiccix uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2, XZR)>;
-
-
-// But many have aliases, which obviously don't fit into
-class SYSalias<dag ins, string asmstring>
- : A64I_system<0b0, (outs), ins, asmstring, [], NoItinerary> {
- let isAsmParserOnly = 1;
-
- bits<14> SysOp;
- let Op0 = 0b01;
- let Op1 = SysOp{13-11};
- let CRn = SysOp{10-7};
- let CRm = SysOp{6-3};
- let Op2 = SysOp{2-0};
-}
-
-def ICix : SYSalias<(ins ic_op:$SysOp, GPR64:$Rt), "ic\t$SysOp, $Rt">;
-
-def ICi : SYSalias<(ins ic_op:$SysOp), "ic\t$SysOp"> {
- let Rt = 0b11111;
-}
-
-def DCix : SYSalias<(ins dc_op:$SysOp, GPR64:$Rt), "dc\t$SysOp, $Rt">;
-def ATix : SYSalias<(ins at_op:$SysOp, GPR64:$Rt), "at\t$SysOp, $Rt">;
-
-def TLBIix : SYSalias<(ins tlbi_op:$SysOp, GPR64:$Rt), "tlbi\t$SysOp, $Rt">;
-
-def TLBIi : SYSalias<(ins tlbi_op:$SysOp), "tlbi\t$SysOp"> {
- let Rt = 0b11111;
-}
-
-
-def SYSLxicci : A64I_system<0b1, (outs GPR64:$Rt),
- (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2),
- "sysl\t$Rt, $Op1, $CRn, $CRm, $Op2",
- [], NoItinerary> {
- let Op0 = 0b01;
-}
-
-// The instructions themselves are rather simple for MSR and MRS.
-def MSRix : A64I_system<0b0, (outs), (ins msr_op:$SysReg, GPR64:$Rt),
- "msr\t$SysReg, $Rt", [], NoItinerary> {
- bits<16> SysReg;
- let Op0 = SysReg{15-14};
- let Op1 = SysReg{13-11};
- let CRn = SysReg{10-7};
- let CRm = SysReg{6-3};
- let Op2 = SysReg{2-0};
-}
-
-def MRSxi : A64I_system<0b1, (outs GPR64:$Rt), (ins mrs_op:$SysReg),
- "mrs\t$Rt, $SysReg", [], NoItinerary> {
- bits<16> SysReg;
- let Op0 = SysReg{15-14};
- let Op1 = SysReg{13-11};
- let CRn = SysReg{10-7};
- let CRm = SysReg{6-3};
- let Op2 = SysReg{2-0};
-}
-
-def MSRii : A64I_system<0b0, (outs), (ins pstate_op:$PState, uimm4:$CRm),
- "msr\t$PState, $CRm", [], NoItinerary> {
- bits<6> PState;
-
- let Op0 = 0b00;
- let Op1 = PState{5-3};
- let CRn = 0b0100;
- let Op2 = PState{2-0};
- let Rt = 0b11111;
-}
-
-//===----------------------------------------------------------------------===//
-// Test & branch (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: TBZ, TBNZ
-
-// The bit to test is a simple unsigned 6-bit immediate in the X-register
-// versions.
-def uimm6 : Operand<i64> {
- let ParserMatchClass = uimm6_asmoperand;
-}
-
-def label_wid14_scal4_asmoperand : label_asmoperand<14, 4>;
-
-def tbimm_target : Operand<OtherVT> {
- let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_tstbr>";
-
- // This label is a 14-bit offset from PC, scaled by the instruction-width: 4.
- let PrintMethod = "printLabelOperand<14, 4>";
- let ParserMatchClass = label_wid14_scal4_asmoperand;
-
- let OperandType = "OPERAND_PCREL";
-}
-
-def A64eq : ImmLeaf<i32, [{ return Imm == A64CC::EQ; }]>;
-def A64ne : ImmLeaf<i32, [{ return Imm == A64CC::NE; }]>;
-
-// These instructions correspond to patterns involving "and" with a power of
-// two, which we need to be able to select.
-def tstb64_pat : ComplexPattern<i64, 1, "SelectTSTBOperand<64>">;
-def tstb32_pat : ComplexPattern<i32, 1, "SelectTSTBOperand<32>">;
-
-let isBranch = 1, isTerminator = 1 in {
- def TBZxii : A64I_TBimm<0b0, (outs),
- (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
- "tbz\t$Rt, $Imm, $Label",
- [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
- A64eq, bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr]>;
-
- def TBNZxii : A64I_TBimm<0b1, (outs),
- (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
- "tbnz\t$Rt, $Imm, $Label",
- [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
- A64ne, bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr]>;
-
-
- // Note, these instructions overlap with the above 64-bit patterns. This is
- // intentional, "tbz x3, #1, somewhere" and "tbz w3, #1, somewhere" would both
- // do the same thing and are both permitted assembly. They also both have
- // sensible DAG patterns.
- def TBZwii : A64I_TBimm<0b0, (outs),
- (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label),
- "tbz\t$Rt, $Imm, $Label",
- [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
- A64eq, bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr]> {
- let Imm{5} = 0b0;
- }
-
- def TBNZwii : A64I_TBimm<0b1, (outs),
- (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label),
- "tbnz\t$Rt, $Imm, $Label",
- [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
- A64ne, bb:$Label)],
- NoItinerary>,
- Sched<[WriteBr]> {
- let Imm{5} = 0b0;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Unconditional branch (immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: B, BL
-
-def label_wid26_scal4_asmoperand : label_asmoperand<26, 4>;
-
-def bimm_target : Operand<OtherVT> {
- let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_uncondbr>";
-
- // This label is a 26-bit offset from PC, scaled by the instruction-width: 4.
- let PrintMethod = "printLabelOperand<26, 4>";
- let ParserMatchClass = label_wid26_scal4_asmoperand;
-
- let OperandType = "OPERAND_PCREL";
-}
-
-def blimm_target : Operand<i64> {
- let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_call>";
-
- // This label is a 26-bit offset from PC, scaled by the instruction-width: 4.
- let PrintMethod = "printLabelOperand<26, 4>";
- let ParserMatchClass = label_wid26_scal4_asmoperand;
-
- let OperandType = "OPERAND_PCREL";
-}
-
-class A64I_BimmImpl<bit op, string asmop, list<dag> patterns, Operand lbl_type>
- : A64I_Bimm<op, (outs), (ins lbl_type:$Label),
- !strconcat(asmop, "\t$Label"), patterns,
- NoItinerary>,
- Sched<[WriteBr]>;
-
-let isBranch = 1 in {
- def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> {
- let isTerminator = 1;
- let isBarrier = 1;
- }
-
- let SchedRW = [WriteBrL] in {
- def BLimm : A64I_BimmImpl<0b1, "bl",
- [(AArch64Call tglobaladdr:$Label)], blimm_target> {
- let isCall = 1;
- let Defs = [X30];
- }
- }
-}
-
-def : Pat<(AArch64Call texternalsym:$Label), (BLimm texternalsym:$Label)>;
-
-//===----------------------------------------------------------------------===//
-// Unconditional branch (register) instructions
-//===----------------------------------------------------------------------===//
-// Contains: BR, BLR, RET, ERET, DRP.
-
-// Most of the notional opcode fields in the A64I_Breg format are fixed in A64
-// at the moment.
-class A64I_BregImpl<bits<4> opc,
- dag outs, dag ins, string asmstr, list<dag> patterns,
- InstrItinClass itin = NoItinerary>
- : A64I_Breg<opc, 0b11111, 0b000000, 0b00000,
- outs, ins, asmstr, patterns, itin>,
- Sched<[WriteBr]> {
- let isBranch = 1;
- let isIndirectBranch = 1;
-}
-
-// Note that these are not marked isCall or isReturn because as far as LLVM is
-// concerned they're not. "ret" is just another jump unless it has been selected
-// by LLVM as the function's return.
-
-let isBranch = 1 in {
- def BRx : A64I_BregImpl<0b0000,(outs), (ins GPR64:$Rn),
- "br\t$Rn", [(brind i64:$Rn)]> {
- let isBarrier = 1;
- let isTerminator = 1;
- }
-
- let SchedRW = [WriteBrL] in {
- def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn),
- "blr\t$Rn", [(AArch64Call i64:$Rn)]> {
- let isBarrier = 0;
- let isCall = 1;
- let Defs = [X30];
- }
- }
-
- def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn),
- "ret\t$Rn", []> {
- let isBarrier = 1;
- let isTerminator = 1;
- let isReturn = 1;
- }
-
- // Create a separate pseudo-instruction for codegen to use so that we don't
- // flag x30 as used in every function. It'll be restored before the RET by the
- // epilogue if it's legitimately used.
- def RET : A64PseudoExpand<(outs), (ins), [(A64ret)], (RETx (ops X30))> {
- let isTerminator = 1;
- let isBarrier = 1;
- let isReturn = 1;
- }
-
- def ERET : A64I_BregImpl<0b0100, (outs), (ins), "eret", []> {
- let Rn = 0b11111;
- let isBarrier = 1;
- let isTerminator = 1;
- let isReturn = 1;
- }
-
- def DRPS : A64I_BregImpl<0b0101, (outs), (ins), "drps", []> {
- let Rn = 0b11111;
- let isBarrier = 1;
- }
-}
-
-def RETAlias : InstAlias<"ret", (RETx X30)>;
-
-
-//===----------------------------------------------------------------------===//
-// Address generation patterns
-//===----------------------------------------------------------------------===//
-
-// Primary method of address generation for the small/absolute memory model is
-// an ADRP/ADR pair:
-// ADRP x0, some_variable
-// ADD x0, x0, #:lo12:some_variable
-//
-// The load/store elision of the ADD is accomplished when selecting
-// addressing-modes. This just mops up the cases where that doesn't work and we
-// really need an address in some register.
-
-// This wrapper applies a LO12 modifier to the address. Otherwise we could just
-// use the same address.
-
-class ADRP_ADD<SDNode Wrapper, SDNode addrop>
- : Pat<(Wrapper addrop:$Hi, addrop:$Lo12, (i32 imm)),
- (ADDxxi_lsl0_s (ADRPxi addrop:$Hi), addrop:$Lo12)>;
-
-def : ADRP_ADD<A64WrapperSmall, tblockaddress>;
-def : ADRP_ADD<A64WrapperSmall, texternalsym>;
-def : ADRP_ADD<A64WrapperSmall, tglobaladdr>;
-def : ADRP_ADD<A64WrapperSmall, tglobaltlsaddr>;
-def : ADRP_ADD<A64WrapperSmall, tjumptable>;
-def : ADRP_ADD<A64WrapperSmall, tconstpool>;
-
-//===----------------------------------------------------------------------===//
-// GOT access patterns
-//===----------------------------------------------------------------------===//
-
-class GOTLoadSmall<SDNode addrfrag>
- : Pat<(A64GOTLoad (A64WrapperSmall addrfrag:$Hi, addrfrag:$Lo12, 8)),
- (LS64_LDR (ADRPxi addrfrag:$Hi), addrfrag:$Lo12)>;
-
-def : GOTLoadSmall<texternalsym>;
-def : GOTLoadSmall<tglobaladdr>;
-def : GOTLoadSmall<tglobaltlsaddr>;
-
-//===----------------------------------------------------------------------===//
-// Tail call handling
-//===----------------------------------------------------------------------===//
-
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [XSP] in {
- def TC_RETURNdi
- : PseudoInst<(outs), (ins i64imm:$dst, i32imm:$FPDiff),
- [(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff))]>;
-
- def TC_RETURNxi
- : PseudoInst<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff),
- [(AArch64tcret i64:$dst, (i32 timm:$FPDiff))]>;
-}
-
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- Uses = [XSP] in {
- def TAIL_Bimm : A64PseudoExpand<(outs), (ins bimm_target:$Label), [],
- (Bimm bimm_target:$Label)>;
-
- def TAIL_BRx : A64PseudoExpand<(outs), (ins tcGPR64:$Rd), [],
- (BRx GPR64:$Rd)>;
-}
-
-
-def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
- (TC_RETURNdi texternalsym:$dst, imm:$FPDiff)>;
-
-//===----------------------------------------------------------------------===//
-// Thread local storage
-//===----------------------------------------------------------------------===//
-
-// This is a pseudo-instruction representing the ".tlsdesccall" directive in
-// assembly. Its effect is to insert an R_AARCH64_TLSDESC_CALL relocation at the
-// current location. It should always be immediately followed by a BLR
-// instruction, and is intended solely for relaxation by the linker.
-
-def : Pat<(A64threadpointer), (MRSxi 0xde82)>;
-
-def TLSDESCCALL : PseudoInst<(outs), (ins i64imm:$Lbl), []> {
- let hasSideEffects = 1;
-}
-
-def TLSDESC_BLRx : PseudoInst<(outs), (ins GPR64:$Rn, i64imm:$Var),
- [(A64tlsdesc_blr i64:$Rn, tglobaltlsaddr:$Var)]> {
- let isCall = 1;
- let Defs = [X30];
-}
-
-def : Pat<(A64tlsdesc_blr i64:$Rn, texternalsym:$Var),
- (TLSDESC_BLRx $Rn, texternalsym:$Var)>;
-
-//===----------------------------------------------------------------------===//
-// Bitfield patterns
-//===----------------------------------------------------------------------===//
-
-def bfi32_lsb_to_immr : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((32 - N->getZExtValue()) % 32, MVT::i64);
+// min(7, 31 - shift_amt)
+def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ enc = enc > 7 ? 7 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;
-def bfi64_lsb_to_immr : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((64 - N->getZExtValue()) % 64, MVT::i64);
+// min(15, 31 - shift_amt)
+def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ enc = enc > 15 ? 15 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;
-def bfi_width_to_imms : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() - 1, MVT::i64);
+def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;
+def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
-// The simpler patterns deal with cases where no AND mask is actually needed
-// (either all bits are used or the low 32 bits are used).
+// min(7, 63 - shift_amt)
+def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 7 ? 7 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(15, 63 - shift_amt)
+def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 15 ? 15 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(31, 63 - shift_amt)
+def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 31 ? 31 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+ (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+ (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_b imm0_63:$imm)))>;
+
let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
-def : Pat<(A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS),
- (BFIxxii $src, $Rn,
- (bfi64_lsb_to_immr (i64 imm:$ImmR)),
- (bfi_width_to_imms (i64 imm:$ImmS)))>;
+def : InstAlias<"asr $dst, $src, $shift",
+ (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+ (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
-def : Pat<(A64Bfi i32:$src, i32:$Rn, imm:$ImmR, imm:$ImmS),
- (BFIwwii $src, $Rn,
- (bfi32_lsb_to_immr (i64 imm:$ImmR)),
- (bfi_width_to_imms (i64 imm:$ImmS)))>;
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+ (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+ (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+def : InstAlias<"lsr $dst, $src, $shift",
+ (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+ (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+//===----------------------------------------------------------------------===//
+// Conditionally set flags instructions.
+//===----------------------------------------------------------------------===//
+defm CCMN : CondSetFlagsImm<0, "ccmn">;
+defm CCMP : CondSetFlagsImm<1, "ccmp">;
+
+defm CCMN : CondSetFlagsReg<0, "ccmn">;
+defm CCMP : CondSetFlagsReg<1, "ccmp">;
+
+//===----------------------------------------------------------------------===//
+// Conditional select instructions.
+//===----------------------------------------------------------------------===//
+defm CSEL : CondSelect<0, 0b00, "csel">;
+
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
+ (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
+ (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
+ (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
+ (CSINVXr XZR, XZR, (i32 imm:$cc))>;
+
+// The inverse of the condition code from the alias instruction is what is used
+// in the aliased instruction. The parser all ready inverts the condition code
+// for these aliases.
+def : InstAlias<"cset $dst, $cc",
+ (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc",
+ (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc",
+ (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc",
+ (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+ (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+ (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+ (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+ (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+ (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+ (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative instructions.
+//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in {
+let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+def ADR : ADRI<0, "adr", adrlabel, []>;
+} // neverHasSideEffects = 1
+
+def ADRP : ADRI<1, "adrp", adrplabel,
+ [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
+
+// page address of a constant pool entry, block address
+def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions.
+//===----------------------------------------------------------------------===//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
+
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+} // isCall
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
+
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+ let AsmString = ".tlsdesccall $sym";
+}
+
+// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
+// gets expanded to two MCInsts during lowering.
+let isCall = 1, Defs = [LR] in
+def TLSDESC_BLR
+ : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
+ [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
+
+def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym),
+ (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
+//===----------------------------------------------------------------------===//
+// Conditional branch (immediate) instruction.
+//===----------------------------------------------------------------------===//
+def Bcc : BranchCond;
+
+//===----------------------------------------------------------------------===//
+// Compare-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm CBZ : CmpBranch<0, "cbz", AArch64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
+
+//===----------------------------------------------------------------------===//
+// Test-bit-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm TBZ : TestBranch<0, "tbz", AArch64tbz>;
+defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (immediate) instructions.
+//===----------------------------------------------------------------------===//
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
+
+//===----------------------------------------------------------------------===//
+// Exception generation instructions.
+//===----------------------------------------------------------------------===//
+def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
-def : Pat<(and (A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS),
- (i64 4294967295)),
- (SUBREG_TO_REG (i64 0),
- (BFIwwii (EXTRACT_SUBREG $src, sub_32),
- (EXTRACT_SUBREG $Rn, sub_32),
- (bfi32_lsb_to_immr (i64 imm:$ImmR)),
- (bfi_width_to_imms (i64 imm:$ImmS))),
- sub_32)>;
+// Pair (no allocate)
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+// Integer
+defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
+defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
+defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
+defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
+
+// Floating-point
+defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>;
+defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>;
+defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+
+// Load sign-extended half-word
+defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
+defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
+
+// Load sign-extended byte
+defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
+defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
+
+// Load sign-extended word
+defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
+
+// Pre-fetch.
+defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
+ ValueType ScalTy, ValueType VecTy,
+ Instruction LOADW, Instruction LOADX,
+ SubRegIndex sub> {
+ def : Pat<(VecTy (scalar_to_vector (ScalTy
+ (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
+ (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+ (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
+ sub)>;
+
+ def : Pat<(VecTy (scalar_to_vector (ScalTy
+ (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
+ (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+ (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
+ sub)>;
+}
+
+let AddedComplexity = 10 in {
+defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
+defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
+
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
+
+defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
+
+defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
+
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))))),
+ (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))))),
+ (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+}
+
+// Match all load 64 bits width whose type is compatible with FPR64
+multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
+ Instruction LOADW, Instruction LOADX> {
+
+ def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+ (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+ (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+ // We must do vector loads with LD1 in big-endian.
+ defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+}
+
+defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
+defm : VecROLoadPat<ro64, v1f64, LDRDroW, LDRDroX>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must do vector loads with LD1 in big-endian.
+ defm : VecROLoadPat<ro128, v2i64, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v2f64, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
+}
+} // AddedComplexity = 10
+
+// zextload -> i64
+multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
+ Instruction INSTW, Instruction INSTX> {
+ def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+ (SUBREG_TO_REG (i64 0),
+ (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+ sub_32)>;
+
+ def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+ (SUBREG_TO_REG (i64 0),
+ (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+ sub_32)>;
+}
+
+let AddedComplexity = 10 in {
+ defm : ExtLoadTo64ROPat<ro8, zextloadi8, LDRBBroW, LDRBBroX>;
+ defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
+ defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW, LDRWroX>;
+
+ // zextloadi1 -> zextloadi8
+ defm : ExtLoadTo64ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+
+ // extload -> zextload
+ defm : ExtLoadTo64ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
+ defm : ExtLoadTo64ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
+ defm : ExtLoadTo64ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;
+
+ // extloadi1 -> zextloadi8
+ defm : ExtLoadTo64ROPat<ro8, extloadi1, LDRBBroW, LDRBBroX>;
+}
+
+
+// zextload -> i64
+multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
+ Instruction INSTW, Instruction INSTX> {
+ def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+ (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+ (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
}
+let AddedComplexity = 10 in {
+ // extload -> zextload
+ defm : ExtLoadTo32ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
+ defm : ExtLoadTo32ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
+ defm : ExtLoadTo32ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;
+
+ // zextloadi1 -> zextloadi8
+ defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+}
+
+//---
+// (unsigned immediate)
+//---
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
+ [(set GPR64:$Rt,
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
+ [(set GPR32:$Rt,
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
+ [(set FPR8:$Rt,
+ (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
+ [(set (f16 FPR16:$Rt),
+ (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
+ [(set (f32 FPR32:$Rt),
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
+ [(set (f64 FPR64:$Rt),
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
+ [(set (f128 FPR128:$Rt),
+ (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32
+ (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32
+ (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32
+ (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32
+ (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v2i64 (scalar_to_vector (i64
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+ // We must use LD1 to perform vector loads in big-endian.
+ def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must use LD1 to perform vector loads in big-endian.
+ def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+
+defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
+ [(set GPR32:$Rt,
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)))]>;
+defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
+ [(set GPR32:$Rt,
+ (zextloadi8 (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)))]>;
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
+ [(set GPR32:$Rt,
+ (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)))]>;
+defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
+ [(set GPR64:$Rt,
+ (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)))]>;
+
+// load sign-extended byte
+defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
+ [(set GPR32:$Rt,
+ (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)))]>;
+defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
+ [(set GPR64:$Rt,
+ (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)))]>;
+
+// load sign-extended word
+defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
+ [(set GPR64:$Rt,
+ (sextloadi32 (am_indexed32 GPR64sp:$Rn,
+ uimm12s4:$offset)))]>;
+
+// load zero-extended word
+def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+ [(AArch64Prefetch imm:$Rt,
+ (am_indexed64 GPR64sp:$Rn,
+ uimm12s8:$offset))]>;
+
+def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
+
+//---
+// (literal)
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
+ [(set GPR64:$Rt,
+ (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
+ [(set GPR32:$Rt,
+ (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
+ [(set FPR8:$Rt,
+ (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
+ [(set FPR16:$Rt,
+ (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
+ [(set (f32 FPR32:$Rt),
+ (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
+ [(set (f64 FPR64:$Rt),
+ (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
+ [(set (f128 FPR128:$Rt),
+ (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
+
+defm LDURHH
+ : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
+ [(set GPR32:$Rt,
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURBB
+ : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
+ [(set GPR32:$Rt,
+ (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+ def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// anyext -> zext
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// the don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+ let Name = "SImm9OffsetFB" # Width;
+ let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+ let RenderMethod = "addImmOperands";
+}
+
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+ : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+ [(set GPR32:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+ : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+ [(set GPR64:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+ : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+ [(set GPR32:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+ : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+ [(set GPR64:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+ : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+ [(set GPR64:$Rt,
+ (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
+ (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
+ (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+ (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+ (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+ (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+ (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
+ (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+
+// Pre-fetch.
+defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
+ [(AArch64Prefetch imm:$Rt,
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
+defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
+
+defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
+defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
+
+// load sign-extended half-word
+defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
+defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
+
+// load sign-extended byte
+defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
+defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
+
+// load sign-extended word
+defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
+
+//---
+// (immediate pre-indexed)
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//---
+// (immediate post-indexed)
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
//===----------------------------------------------------------------------===//
-// Miscellaneous patterns
+// Store instructions.
//===----------------------------------------------------------------------===//
-// Truncation from 64 to 32-bits just involves renaming your register.
-def : Pat<(i32 (trunc i64:$val)), (EXTRACT_SUBREG $val, sub_32)>;
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
-// Similarly, extension where we don't care about the high bits is
-// just a rename.
-def : Pat<(i64 (anyext i32:$val)),
- (INSERT_SUBREG (IMPLICIT_DEF), $val, sub_32)>;
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
-// SELECT instructions providing f128 types need to be handled by a
+// Pair (pre-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+
+//---
+// (Register offset)
+
+// Integer
+defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
+defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
+defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
+defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
+
+
+// Floating-point
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>;
+defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>;
+defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
+defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
+
+multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
+ Instruction STRW, Instruction STRX> {
+
+ def : Pat<(storeop GPR64:$Rt,
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(storeop GPR64:$Rt,
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+ // truncstore i64
+ defm : TruncStoreFrom64ROPat<ro8, truncstorei8, STRBBroW, STRBBroX>;
+ defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
+ defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW, STRWroX>;
+}
+
+multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
+ Instruction STRW, Instruction STRX> {
+ def : Pat<(store (VecTy FPR:$Rt),
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(store (VecTy FPR:$Rt),
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+}
+
+defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
+defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+}
+} // AddedComplexity = 10
+
+//---
+// (unsigned immediate)
+defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
+ [(store GPR64:$Rt,
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
+ [(store GPR32:$Rt,
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
+ [(store FPR8:$Rt,
+ (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
+ [(store (f16 FPR16:$Rt),
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
+ [(store (f32 FPR32:$Rt),
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
+ [(store (f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+
+defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
+ [(truncstorei16 GPR32:$Rt,
+ (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset))]>;
+defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb",
+ [(truncstorei8 GPR32:$Rt,
+ (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v2f32 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v8i8 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4i16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v2i32 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v4f32 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v2f64 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v16i8 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8i16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v4i32 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v2i64 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(store (f128 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
+// truncstore i64
+def : Pat<(truncstorei32 GPR64:$Rt,
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt,
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+ (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unscaled immediate)
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
+ [(store GPR64:$Rt,
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
+ [(store GPR32:$Rt,
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
+ [(store FPR8:$Rt,
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
+ [(store (f16 FPR16:$Rt),
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
+ [(store (f32 FPR32:$Rt),
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
+ [(store (f64 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
+ [(store (f128 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
+ [(truncstorei16 GPR32:$Rt,
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
+ [(truncstorei8 GPR32:$Rt,
+ (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v2f32 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8i8 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4i16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2i32 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+ // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v4f32 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2f64 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v16i8 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8i16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4i32 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2i64 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v2f64 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+ (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+ (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+def : InstAlias<"strb $Rt, [$Rn, $offset]",
+ (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"strh $Rt, [$Rn, $offset]",
+ (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+
+def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//---
+// (immediate post-indexed)
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+ (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+ simm9:$off)>;
+
+def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//===----------------------------------------------------------------------===//
+// Load/store exclusive instructions.
+//===----------------------------------------------------------------------===//
+
+def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
+def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
+def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
+def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
+
+def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
+def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
+def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
+def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
+
+def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
+def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
+def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
+def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
+
+def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
+def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
+def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
+def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
+
+def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
+def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
+def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
+def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
+
+def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
+def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
+def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
+def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
+
+def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
+def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
+
+def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
+def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
+
+def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
+def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
+
+def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
+def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
+
+//===----------------------------------------------------------------------===//
+// Scaled floating point to integer conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled integer to floating point conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+
+//===----------------------------------------------------------------------===//
+// Unscaled integer to floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FMOV : UnscaledConversion<"fmov">;
+
+def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
+def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCVT : FPConversion<"fcvt">;
+
+def : Pat<(f32_to_f16 FPR32:$Rn),
+ (i32 (COPY_TO_REGCLASS
+ (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
+ GPR32))>;
+
+def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
+ [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
+
+//===----------------------------------------------------------------------===//
+// Floating point single operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
+defm FMOV : SingleOperandFPData<0b0000, "fmov">;
+defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
+
+def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
+ (FRINTNDr FPR64:$Rn)>;
+
+// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
+// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
+// <rdar://problem/13715968>
+// TODO: We should really model the FPSR flags correctly. This is really ugly.
+let hasSideEffects = 1 in {
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
+}
+
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
+
+let SchedRW = [WriteFDiv] in {
+defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point two operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
+let SchedRW = [WriteFDiv] in {
+defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+}
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>;
+defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>;
+let SchedRW = [WriteFMul] in {
+defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
+defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+}
+defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
+
+def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point three operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
+ TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
+ TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
+ TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+
+// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
+// the NEON variant.
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
+ (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
+ (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
+// "(-a) + b*(-c)".
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
+ (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
+ (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
+ (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
+ (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCMPE : FPComparison<1, "fcmpe">;
+defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCCMPE : FPCondComparison<1, "fccmpe">;
+defm FCCMP : FPCondComparison<0, "fccmp">;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional select instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCSEL : FPCondSelect<"fcsel">;
+
+// CSEL instructions providing f128 types need to be handled by a
// pseudo-instruction since the eventual code will need to introduce basic
// blocks and control flow.
-def F128CSEL : PseudoInst<(outs FPR128:$Rd),
- (ins FPR128:$Rn, FPR128:$Rm, cond_code_op:$Cond),
- [(set f128:$Rd, (simple_select f128:$Rn, f128:$Rm))]> {
+def F128CSEL : Pseudo<(outs FPR128:$Rd),
+ (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
+ [(set (f128 FPR128:$Rd),
+ (AArch64csel FPR128:$Rn, FPR128:$Rm,
+ (i32 imm:$cond), NZCV))]> {
let Uses = [NZCV];
let usesCustomInserter = 1;
}
-//===----------------------------------------------------------------------===//
-// Load/store patterns
-//===----------------------------------------------------------------------===//
-
-// There are lots of patterns here, because we need to allow at least three
-// parameters to vary independently.
-// 1. Instruction: "ldrb w9, [sp]", "ldrh w9, [sp]", ...
-// 2. LLVM source: zextloadi8, anyextloadi8, ...
-// 3. Address-generation: A64Wrapper, (add BASE, OFFSET), ...
-//
-// The biggest problem turns out to be the address-generation variable. At the
-// point of instantiation we need to produce two DAGs, one for the pattern and
-// one for the instruction. Doing this at the lowest level of classes doesn't
-// work.
-//
-// Consider the simple uimm12 addressing mode, and the desire to match both (add
-// GPR64xsp:$Rn, uimm12:$Offset) and GPR64xsp:$Rn, particularly on the
-// instruction side. We'd need to insert either "GPR64xsp" and "uimm12" or
-// "GPR64xsp" and "0" into an unknown dag. !subst is not capable of this
-// operation, and PatFrags are for selection not output.
-//
-// As a result, the address-generation patterns are the final
-// instantiations. However, we do still need to vary the operand for the address
-// further down (At the point we're deciding A64WrapperSmall, we don't know
-// the memory width of the operation).
-
-//===------------------------------
-// 1. Basic infrastructural defs
-//===------------------------------
-
-// First, some simple classes for !foreach and !subst to use:
-class Decls {
- dag pattern;
-}
-
-def decls : Decls;
-def ALIGN;
-def INST;
-def OFFSET;
-def SHIFT;
-
-// You can't use !subst on an actual immediate, but you *can* use it on an
-// operand record that happens to match a single immediate. So we do.
-def imm_eq0 : ImmLeaf<i64, [{ return Imm == 0; }]>;
-def imm_eq1 : ImmLeaf<i64, [{ return Imm == 1; }]>;
-def imm_eq2 : ImmLeaf<i64, [{ return Imm == 2; }]>;
-def imm_eq3 : ImmLeaf<i64, [{ return Imm == 3; }]>;
-def imm_eq4 : ImmLeaf<i64, [{ return Imm == 4; }]>;
-
-// If the low bits of a pointer are known to be 0 then an "or" is just as good
-// as addition for computing an offset. This fragment forwards that check for
-// TableGen's use.
-def add_like_or : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),
-[{
- return CurDAG->isBaseWithConstantOffset(SDValue(N, 0));
-}]>;
-
-// Load/store (unsigned immediate) operations with relocations against global
-// symbols (for lo12) are only valid if those symbols have correct alignment
-// (since the immediate offset is divided by the access scale, it can't have a
-// remainder).
-//
-// The guaranteed alignment is provided as part of the WrapperSmall
-// operation, and checked against one of these.
-def any_align : ImmLeaf<i32, [{ (void)Imm; return true; }]>;
-def min_align2 : ImmLeaf<i32, [{ return Imm >= 2; }]>;
-def min_align4 : ImmLeaf<i32, [{ return Imm >= 4; }]>;
-def min_align8 : ImmLeaf<i32, [{ return Imm >= 8; }]>;
-def min_align16 : ImmLeaf<i32, [{ return Imm >= 16; }]>;
-
-// "Normal" load/store instructions can be used on atomic operations, provided
-// the ordering parameter is at most "monotonic". Anything above that needs
-// special handling with acquire/release instructions.
-class simple_load<PatFrag base>
- : PatFrag<(ops node:$ptr), (base node:$ptr), [{
- return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic;
-}]>;
-
-def atomic_load_simple_i8 : simple_load<atomic_load_8>;
-def atomic_load_simple_i16 : simple_load<atomic_load_16>;
-def atomic_load_simple_i32 : simple_load<atomic_load_32>;
-def atomic_load_simple_i64 : simple_load<atomic_load_64>;
-
-class simple_store<PatFrag base>
- : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
- return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic;
-}]>;
-
-def atomic_store_simple_i8 : simple_store<atomic_store_8>;
-def atomic_store_simple_i16 : simple_store<atomic_store_16>;
-def atomic_store_simple_i32 : simple_store<atomic_store_32>;
-def atomic_store_simple_i64 : simple_store<atomic_store_64>;
-
-//===------------------------------
-// 2. UImm12 and SImm9
-//===------------------------------
-
-// These instructions have two operands providing the address so they can be
-// treated similarly for most purposes.
-
-//===------------------------------
-// 2.1 Base patterns covering extend/truncate semantics
-//===------------------------------
-
-// Atomic patterns can be shared between integer operations of all sizes, a
-// quick multiclass here allows reuse.
-multiclass ls_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
- dag Offset, dag address, ValueType transty,
- ValueType sty> {
- def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
- (LOAD Base, Offset)>;
-
- def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, transty:$Rt),
- (STORE $Rt, Base, Offset)>;
-}
-
-// Instructions accessing a memory chunk smaller than a register (or, in a
-// pinch, the same size) have a characteristic set of patterns they want to
-// match: extending loads and truncating stores. This class deals with the
-// sign-neutral version of those patterns.
-//
-// It will be instantiated across multiple addressing-modes.
-multiclass ls_small_pats<Instruction LOAD, Instruction STORE,
- dag Base, dag Offset,
- dag address, ValueType sty>
- : ls_atomic_pats<LOAD, STORE, Base, Offset, address, i32, sty> {
- def : Pat<(!cast<SDNode>(zextload # sty) address), (LOAD Base, Offset)>;
-
- def : Pat<(!cast<SDNode>(extload # sty) address), (LOAD Base, Offset)>;
-
- // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
- // register was actually set.
- def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
- (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>;
-
- def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
- (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>;
-
- def : Pat<(!cast<SDNode>(truncstore # sty) i32:$Rt, address),
- (STORE $Rt, Base, Offset)>;
-
- // For truncating store from 64-bits, we have to manually tell LLVM to
- // ignore the high bits of the x register.
- def : Pat<(!cast<SDNode>(truncstore # sty) i64:$Rt, address),
- (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>;
-}
-
-// Next come patterns for sign-extending loads.
-multiclass load_signed_pats<string T, string U, dag Base, dag Offset,
- dag address, ValueType sty> {
- def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
- (!cast<Instruction>("LDRS" # T # "w" # U) Base, Offset)>;
-
- def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
- (!cast<Instruction>("LDRS" # T # "x" # U) Base, Offset)>;
-
-}
-
-// and finally "natural-width" loads and stores come next.
-multiclass ls_neutral_pats<Instruction LOAD, Instruction STORE, dag Base,
- dag Offset, dag address, ValueType sty> {
- def : Pat<(sty (load address)), (LOAD Base, Offset)>;
- def : Pat<(store sty:$Rt, address), (STORE $Rt, Base, Offset)>;
-}
-
-// Integer operations also get atomic instructions to select for.
-multiclass ls_int_neutral_pats<Instruction LOAD, Instruction STORE, dag Base,
- dag Offset, dag address, ValueType sty>
- : ls_neutral_pats<LOAD, STORE, Base, Offset, address, sty>,
- ls_atomic_pats<LOAD, STORE, Base, Offset, address, sty, sty>;
-
-//===------------------------------
-// 2.2. Addressing-mode instantiations
-//===------------------------------
-
-multiclass uimm12_pats<dag address, dag Base, dag Offset> {
- defm : ls_small_pats<LS8_LDR, LS8_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, byte_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, byte_uimm12,
- !subst(ALIGN, any_align, decls.pattern))),
- i8>;
- defm : ls_small_pats<LS16_LDR, LS16_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, hword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, hword_uimm12,
- !subst(ALIGN, min_align2, decls.pattern))),
- i16>;
- defm : ls_small_pats<LS32_LDR, LS32_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, word_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, word_uimm12,
- !subst(ALIGN, min_align4, decls.pattern))),
- i32>;
-
- defm : ls_int_neutral_pats<LS32_LDR, LS32_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, word_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, word_uimm12,
- !subst(ALIGN, min_align4, decls.pattern))),
- i32>;
-
- defm : ls_int_neutral_pats<LS64_LDR, LS64_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, dword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, dword_uimm12,
- !subst(ALIGN, min_align8, decls.pattern))),
- i64>;
-
- defm : ls_neutral_pats<LSFP16_LDR, LSFP16_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, hword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, hword_uimm12,
- !subst(ALIGN, min_align2, decls.pattern))),
- f16>;
-
- defm : ls_neutral_pats<LSFP32_LDR, LSFP32_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, word_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, word_uimm12,
- !subst(ALIGN, min_align4, decls.pattern))),
- f32>;
-
- defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, dword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, dword_uimm12,
- !subst(ALIGN, min_align8, decls.pattern))),
- f64>;
-
- defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, qword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, qword_uimm12,
- !subst(ALIGN, min_align16, decls.pattern))),
- f128>;
-
- defm : load_signed_pats<"B", "", Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, byte_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, byte_uimm12,
- !subst(ALIGN, any_align, decls.pattern))),
- i8>;
-
- defm : load_signed_pats<"H", "", Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, hword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, hword_uimm12,
- !subst(ALIGN, min_align2, decls.pattern))),
- i16>;
-
- def : Pat<(sextloadi32 !foreach(decls.pattern, address,
- !subst(OFFSET, word_uimm12,
- !subst(ALIGN, min_align4, decls.pattern)))),
- (LDRSWx Base, !foreach(decls.pattern, Offset,
- !subst(OFFSET, word_uimm12, decls.pattern)))>;
-}
-
-// Straightforward patterns of last resort: a pointer with or without an
-// appropriate offset.
-defm : uimm12_pats<(i64 i64:$Rn), (i64 i64:$Rn), (i64 0)>;
-defm : uimm12_pats<(add i64:$Rn, OFFSET:$UImm12),
- (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
-
-// The offset could be hidden behind an "or", of course:
-defm : uimm12_pats<(add_like_or i64:$Rn, OFFSET:$UImm12),
- (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
-
-// Global addresses under the small-absolute model should use these
-// instructions. There are ELF relocations specifically for it.
-defm : uimm12_pats<(A64WrapperSmall tglobaladdr:$Hi, tglobaladdr:$Lo12, ALIGN),
- (ADRPxi tglobaladdr:$Hi), (i64 tglobaladdr:$Lo12)>;
-
-defm : uimm12_pats<(A64WrapperSmall tglobaltlsaddr:$Hi, tglobaltlsaddr:$Lo12,
- ALIGN),
- (ADRPxi tglobaltlsaddr:$Hi), (i64 tglobaltlsaddr:$Lo12)>;
-
-// External symbols that make it this far should also get standard relocations.
-defm : uimm12_pats<(A64WrapperSmall texternalsym:$Hi, texternalsym:$Lo12,
- ALIGN),
- (ADRPxi texternalsym:$Hi), (i64 texternalsym:$Lo12)>;
-
-defm : uimm12_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
- (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
-
-// We also want to use uimm12 instructions for local variables at the moment.
-def tframeindex_XFORM : SDNodeXForm<frameindex, [{
- int FI = cast<FrameIndexSDNode>(N)->getIndex();
- return CurDAG->getTargetFrameIndex(FI, MVT::i64);
-}]>;
-
-defm : uimm12_pats<(i64 frameindex:$Rn),
- (tframeindex_XFORM tframeindex:$Rn), (i64 0)>;
-
-// These can be much simpler than uimm12 because we don't to change the operand
-// type (e.g. LDURB and LDURH take the same operands).
-multiclass simm9_pats<dag address, dag Base, dag Offset> {
- defm : ls_small_pats<LS8_LDUR, LS8_STUR, Base, Offset, address, i8>;
- defm : ls_small_pats<LS16_LDUR, LS16_STUR, Base, Offset, address, i16>;
-
- defm : ls_int_neutral_pats<LS32_LDUR, LS32_STUR, Base, Offset, address, i32>;
- defm : ls_int_neutral_pats<LS64_LDUR, LS64_STUR, Base, Offset, address, i64>;
-
- defm : ls_neutral_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address, f16>;
- defm : ls_neutral_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address, f32>;
- defm : ls_neutral_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address, f64>;
- defm : ls_neutral_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address,
- f128>;
-
- def : Pat<(i64 (zextloadi32 address)),
- (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
-
- def : Pat<(truncstorei32 i64:$Rt, address),
- (LS32_STUR (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>;
-
- defm : load_signed_pats<"B", "_U", Base, Offset, address, i8>;
- defm : load_signed_pats<"H", "_U", Base, Offset, address, i16>;
- def : Pat<(sextloadi32 address), (LDURSWx Base, Offset)>;
-}
-
-defm : simm9_pats<(add i64:$Rn, simm9:$SImm9),
- (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>;
-
-defm : simm9_pats<(add_like_or i64:$Rn, simm9:$SImm9),
- (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>;
-
-
-//===------------------------------
-// 3. Register offset patterns
-//===------------------------------
-
-// Atomic patterns can be shared between integer operations of all sizes, a
-// quick multiclass here allows reuse.
-multiclass ro_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
- dag Offset, dag Extend, dag address,
- ValueType transty, ValueType sty> {
- def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
- (LOAD Base, Offset, Extend)>;
-
- def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, transty:$Rt),
- (STORE $Rt, Base, Offset, Extend)>;
-}
-
-// The register offset instructions take three operands giving the instruction,
-// and have an annoying split between instructions where Rm is 32-bit and
-// 64-bit. So we need a special hierarchy to describe them. Other than that the
-// same operations should be supported as for simm9 and uimm12 addressing.
-
-multiclass ro_small_pats<Instruction LOAD, Instruction STORE,
- dag Base, dag Offset, dag Extend,
- dag address, ValueType sty>
- : ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, i32, sty> {
- def : Pat<(!cast<SDNode>(zextload # sty) address),
- (LOAD Base, Offset, Extend)>;
-
- def : Pat<(!cast<SDNode>(extload # sty) address),
- (LOAD Base, Offset, Extend)>;
-
- // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
- // register was actually set.
- def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
- (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
-
- def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
- (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
-
- def : Pat<(!cast<SDNode>(truncstore # sty) i32:$Rt, address),
- (STORE $Rt, Base, Offset, Extend)>;
-
- // For truncating store from 64-bits, we have to manually tell LLVM to
- // ignore the high bits of the x register.
- def : Pat<(!cast<SDNode>(truncstore # sty) i64:$Rt, address),
- (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset, Extend)>;
-
-}
-
-// Next come patterns for sign-extending loads.
-multiclass ro_signed_pats<string T, string Rm, dag Base, dag Offset, dag Extend,
- dag address, ValueType sty> {
- def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
- (!cast<Instruction>("LDRS" # T # "w_" # Rm # "_RegOffset")
- Base, Offset, Extend)>;
-
- def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
- (!cast<Instruction>("LDRS" # T # "x_" # Rm # "_RegOffset")
- Base, Offset, Extend)>;
-}
-
-// and finally "natural-width" loads and stores come next.
-multiclass ro_neutral_pats<Instruction LOAD, Instruction STORE,
- dag Base, dag Offset, dag Extend, dag address,
- ValueType sty> {
- def : Pat<(sty (load address)), (LOAD Base, Offset, Extend)>;
- def : Pat<(store sty:$Rt, address),
- (STORE $Rt, Base, Offset, Extend)>;
-}
-
-multiclass ro_int_neutral_pats<Instruction LOAD, Instruction STORE,
- dag Base, dag Offset, dag Extend, dag address,
- ValueType sty>
- : ro_neutral_pats<LOAD, STORE, Base, Offset, Extend, address, sty>,
- ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, sty, sty>;
-
-multiclass regoff_pats<string Rm, dag address, dag Base, dag Offset,
- dag Extend> {
- defm : ro_small_pats<!cast<Instruction>("LS8_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LS8_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq0, decls.pattern)),
- i8>;
- defm : ro_small_pats<!cast<Instruction>("LS16_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LS16_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq1, decls.pattern)),
- i16>;
- defm : ro_small_pats<!cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq2, decls.pattern)),
- i32>;
-
- defm : ro_int_neutral_pats<
- !cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq2, decls.pattern)),
- i32>;
-
- defm : ro_int_neutral_pats<
- !cast<Instruction>("LS64_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LS64_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq3, decls.pattern)),
- i64>;
-
- defm : ro_neutral_pats<!cast<Instruction>("LSFP16_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LSFP16_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq1, decls.pattern)),
- f16>;
-
- defm : ro_neutral_pats<!cast<Instruction>("LSFP32_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LSFP32_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq2, decls.pattern)),
- f32>;
-
- defm : ro_neutral_pats<!cast<Instruction>("LSFP64_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LSFP64_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq3, decls.pattern)),
- f64>;
-
- defm : ro_neutral_pats<!cast<Instruction>("LSFP128_" # Rm # "_RegOffset_LDR"),
- !cast<Instruction>("LSFP128_" # Rm # "_RegOffset_STR"),
- Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq4, decls.pattern)),
- f128>;
-
- defm : ro_signed_pats<"B", Rm, Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq0, decls.pattern)),
- i8>;
-
- defm : ro_signed_pats<"H", Rm, Base, Offset, Extend,
- !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq1, decls.pattern)),
- i16>;
-
- def : Pat<(sextloadi32 !foreach(decls.pattern, address,
- !subst(SHIFT, imm_eq2, decls.pattern))),
- (!cast<Instruction>("LDRSWx_" # Rm # "_RegOffset")
- Base, Offset, Extend)>;
-}
-
-
-// Finally we're in a position to tell LLVM exactly what addresses are reachable
-// using register-offset instructions. Essentially a base plus a possibly
-// extended, possibly shifted (by access size) offset.
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (sext i32:$Rm)),
- (i64 i64:$Rn), (i32 i32:$Rm), (i64 6)>;
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (shl (sext i32:$Rm), SHIFT)),
- (i64 i64:$Rn), (i32 i32:$Rm), (i64 7)>;
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (zext i32:$Rm)),
- (i64 i64:$Rn), (i32 i32:$Rm), (i64 2)>;
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (shl (zext i32:$Rm), SHIFT)),
- (i64 i64:$Rn), (i32 i32:$Rm), (i64 3)>;
-
-defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm),
- (i64 i64:$Rn), (i64 i64:$Rm), (i64 2)>;
-
-defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
- (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>;
//===----------------------------------------------------------------------===//
-// Advanced SIMD (NEON) Support
+// Floating point immediate move.
+//===----------------------------------------------------------------------===//
+
+let isReMaterializable = 1 in {
+defm FMOV : FPMoveImmediate<"fmov">;
+}
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
+defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
+defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
+defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
+defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+
+defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
+defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
+ (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+ (i64 4)))),
+ (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+ (i64 2))))),
+ (FCVTLv4i32 V128:$Rn)>;
+
+defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
+defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
+def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
+ (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd,
+ (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+ (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
+ (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
+defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
+ int_aarch64_neon_fcvtxn>;
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
+ int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
+ int_aarch64_neon_fcvtzu>;
+}
+defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
+defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
+ UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
+// Aliases for MVN -> NOT.
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
+ (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
+ (NOTv16i8 V128:$Vd, V128:$Vn)>;
+
+def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
+def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
+def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
+def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
+def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
+
+def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
+defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
+defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
+defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
+ BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
+defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+defm SHLL : SIMDVectorLShiftLongBySizeBHS;
+defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
+defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
+defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
+ BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
+ int_aarch64_neon_uaddlp>;
+defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
+defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Patterns for vector long shift (by element width). These need to match all
+// three of zext, sext and anyext so it's easier to pull the patterns out of the
+// definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+ def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+ (SHLLv8i8 V64:$Rn)>;
+ def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+ (SHLLv16i8 V128:$Rn)>;
+ def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+ (SHLLv4i16 V64:$Rn)>;
+ def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+ (SHLLv8i16 V128:$Rn)>;
+ def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+ (SHLLv2i32 V64:$Rn)>;
+ def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+ (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+ TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
+ (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+ (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+ (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
+defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
+defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
+ TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
+ TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
+defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
+defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
+defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
+defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
+defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>;
+defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
+defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>;
+defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
+defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
+defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
+defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
+defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
+defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
+defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
+defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
+defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
+defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>;
+defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
+defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>;
+defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
+defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
+defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
+
+defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
+ BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
+defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
+ TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
+defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
+ BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
+defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
+
+def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
+def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+
+def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
+def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+
+def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmls.8b\t$dst, $src1, $src2}",
+ (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmls.16b\t$dst, $src1, $src2}",
+ (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmls.4h\t$dst, $src1, $src2}",
+ (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmls.8h\t$dst, $src1, $src2}",
+ (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmls.2s\t$dst, $src1, $src2}",
+ (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmls.4s\t$dst, $src1, $src2}",
+ (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmls.2d\t$dst, $src1, $src2}",
+ (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmlo.8b\t$dst, $src1, $src2}",
+ (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmlo.16b\t$dst, $src1, $src2}",
+ (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmlo.4h\t$dst, $src1, $src2}",
+ (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmlo.8h\t$dst, $src1, $src2}",
+ (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmlo.2s\t$dst, $src1, $src2}",
+ (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmlo.4s\t$dst, $src1, $src2}",
+ (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmlo.2d\t$dst, $src1, $src2}",
+ (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmle.8b\t$dst, $src1, $src2}",
+ (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmle.16b\t$dst, $src1, $src2}",
+ (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmle.4h\t$dst, $src1, $src2}",
+ (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmle.8h\t$dst, $src1, $src2}",
+ (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmle.2s\t$dst, $src1, $src2}",
+ (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmle.4s\t$dst, $src1, $src2}",
+ (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmle.2d\t$dst, $src1, $src2}",
+ (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmlt.8b\t$dst, $src1, $src2}",
+ (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmlt.16b\t$dst, $src1, $src2}",
+ (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmlt.4h\t$dst, $src1, $src2}",
+ (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmlt.8h\t$dst, $src1, $src2}",
+ (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmlt.2s\t$dst, $src1, $src2}",
+ (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmlt.4s\t$dst, $src1, $src2}",
+ (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmlt.2d\t$dst, $src1, $src2}",
+ (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|fcmle.2s\t$dst, $src1, $src2}",
+ (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|fcmle.4s\t$dst, $src1, $src2}",
+ (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|fcmle.2d\t$dst, $src1, $src2}",
+ (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|fcmlt.2s\t$dst, $src1, $src2}",
+ (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|fcmlt.4s\t$dst, $src1, $src2}",
+ (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|fcmlt.2d\t$dst, $src1, $src2}",
+ (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|facle.2s\t$dst, $src1, $src2}",
+ (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|facle.4s\t$dst, $src1, $src2}",
+ (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|facle.2d\t$dst, $src1, $src2}",
+ (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|faclt.2s\t$dst, $src1, $src2}",
+ (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|faclt.4s\t$dst, $src1, $src2}",
+ (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|faclt.2d\t$dst, $src1, $src2}",
+ (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
+defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>;
+def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FABD64 FPR64:$Rn, FPR64:$Rm)>;
+defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
+ int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
+ int_aarch64_neon_facgt>;
+defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
+defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
+defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
+defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
+defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
+defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
+defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
+defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
+defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
+
+def : InstAlias<"cmls $dst, $src1, $src2",
+ (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmle $dst, $src1, $src2",
+ (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlo $dst, $src1, $src2",
+ (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlt $dst, $src1, $src2",
+ (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+ (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+ (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+ (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+ (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+ (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+ (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+ (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+ (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions (mixed operands).
+//===----------------------------------------------------------------------===//
+defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+ int_aarch64_neon_sqdmulls_scalar>;
+defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
+defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">;
+def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
+defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">;
+defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
+ UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
+defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
+defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
+ int_aarch64_neon_suqadd>;
+defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
+defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
+ int_aarch64_neon_usqadd>;
+
+def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
+ (FCVTASv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
+ (FCVTAUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
+ (FCVTMSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
+ (FCVTMUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
+ (FCVTNSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
+ (FCVTNUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
+ (FCVTPSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
+ (FCVTPUv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
+ (FRECPXv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
+ (FRECPXv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// Here are the patterns for 8 and 16-bits to float.
+// 8-bits -> float.
+multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
+ SDPatternOperator loadop, Instruction UCVTF,
+ ROAddrMode ro, Instruction LDRW, Instruction LDRX,
+ SubRegIndex sub> {
+ def : Pat<(DstTy (uint_to_fp (SrcTy
+ (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
+ ro.Wext:$extend))))),
+ (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+ (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+ sub))>;
+
+ def : Pat<(DstTy (uint_to_fp (SrcTy
+ (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
+ ro.Wext:$extend))))),
+ (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+ (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+ sub))>;
+}
+
+defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
+ UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> float.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
+ UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// UCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
+ UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
+ UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, load,
+ UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
+def : Pat <(f64 (uint_to_fp (i32
+ (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32
+ (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three different-sized vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+ int_aarch64_neon_sabd>;
+defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+ int_aarch64_neon_sabd>;
+defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
+ BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
+ BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+ TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+ int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+ int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+ int_aarch64_neon_sqdmull>;
+defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+ BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+ BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+ int_aarch64_neon_uabd>;
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+ int_aarch64_neon_uabd>;
+defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+ BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+ BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+ TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+ BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
+ BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Patterns for 64-bit pmull
+def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
+ (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
+ (vector_extract (v2i64 V128:$Rm), (i64 1))),
+ (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
+
+// ADDHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
+ (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+ (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 8))))),
+ (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+ (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+ (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+
+// SUBHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
+ (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+ (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 8))))),
+ (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+ (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+ (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector instruction.
+//----------------------------------------------------------------------------
+
+defm EXT : SIMDBitwiseExtract<"ext">;
+
+def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+
+// We use EXT to handle extract_subvector to copy the upper 64-bits of a
+// 128-bit vector.
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
+defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
+defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
+defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
+defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
+defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX instructions
+//----------------------------------------------------------------------------
+
+defm TBL : SIMDTableLookup< 0, "tbl">;
+defm TBX : SIMDTableLookupTied<1, "tbx">;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+ (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+ (TBLv16i8One V128:$Ri, V128:$Rn)>;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
+ (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+ (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
+ (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+ (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY instruction
+//----------------------------------------------------------------------------
+
+defm CPY : SIMDScalarCPY<"cpy">;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
+defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
+defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
+defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
+def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))),
+ (ADDPv2i64p V128:$Rn)>;
+def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))),
+ (ADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
+ (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
+ (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
+ (FADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
+ (FMAXNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
+ (FMAXNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
+ (FMAXPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
+ (FMAXPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
+ (FMINNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
+ (FMINNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
+ (FMINPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
+ (FMINPv2i64p V128:$Rn)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>;
+def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>;
+def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>;
+def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>;
+def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>;
+def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>;
+def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>;
+
+def DUPv2i64lane : SIMDDup64FromElement;
+def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
+def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
+def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
+def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
+def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
+def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+
+def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
+ (v2f32 (DUPv2i32lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+ (i64 0)))>;
+def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
+ (v4f32 (DUPv4i32lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+ (i64 0)))>;
+def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
+ (v2f64 (DUPv2i64lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
+ (i64 0)))>;
+
+def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+ (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+ (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
+ (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
+// instruction even if the types don't match: we just have to remap the lane
+// carefully. N.b. this trick only applies to truncations.
+def VecIndex_x2 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x4 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x8 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
+}]>;
+
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
+ ValueType Src128VT, ValueType ScalVT,
+ Instruction DUP, SDNodeXForm IdxXFORM> {
+ def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+ imm:$idx)))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+ def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+ imm:$idx)))),
+ (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+ SDNodeXForm IdxXFORM> {
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+ imm:$idx))))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
+ imm:$idx))))),
+ (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
+
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+ (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+ (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+ (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
+ (i32 0xff)),
+ (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
+ (i32 0xffff)),
+ (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+
+defm INS : SIMDIns;
+
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+ (SUBREG_TO_REG (i32 0),
+ (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+ (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+ (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (i64 FPR64:$Rn), dsub))>;
+
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+
+def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
+ (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi32lane
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+ (i64 0)),
+ dsub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
+ (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (INSvi32lane
+ V128:$Rn, VectorIndexS:$imm,
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+ (i64 0))>;
+def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
+ (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
+ (INSvi64lane
+ V128:$Rn, VectorIndexD:$imm,
+ (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
+ (i64 0))>;
+
+// Copy an element at a constant index in one vector into a constant indexed
+// element of another.
+// FIXME refactor to a shared class/dev parameterized on vector type, vector
+// index type and INS extension
+def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
+ (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+ VectorIndexB:$idx2)),
+ (v16i8 (INSvi8lane
+ V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+ )>;
+def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
+ (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+ VectorIndexH:$idx2)),
+ (v8i16 (INSvi16lane
+ V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+ )>;
+def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
+ (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+ VectorIndexS:$idx2)),
+ (v4i32 (INSvi32lane
+ V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+ )>;
+def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
+ (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+ VectorIndexD:$idx2)),
+ (v2i64 (INSvi64lane
+ V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+ )>;
+
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
+ ValueType VTScal, Instruction INS> {
+ def : Pat<(VT128 (vector_insert V128:$src,
+ (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+
+ def : Pat<(VT128 (vector_insert V128:$src,
+ (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (INS V128:$src, imm:$Immd,
+ (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+
+ def : Pat<(VT64 (vector_insert V64:$src,
+ (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+ imm:$Immd, V128:$Rn, imm:$Immn),
+ dsub)>;
+
+ def : Pat<(VT64 (vector_insert V64:$src,
+ (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+ imm:$Immd)),
+ (EXTRACT_SUBREG
+ (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
+ (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
+ dsub)>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi32lane>;
+
+
+// Floating point vector extractions are codegen'd as either a sequence of
+// subregister extractions, possibly fed by an INS if the lane number is
+// anything other than zero.
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+ (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+ (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+ (f64 (EXTRACT_SUBREG
+ (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
+ V128:$Rn, VectorIndexD:$idx),
+ dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+ (f32 (EXTRACT_SUBREG
+ (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
+ V128:$Rn, VectorIndexS:$idx),
+ ssub))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// AArch64. In the general case we need an instruction, which had just as well be
+// INS.
+class ConcatPat<ValueType DstTy, ValueType SrcTy>
+ : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+ (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+def : ConcatPat<v2i64, v1i64>;
+def : ConcatPat<v2f64, v1f64>;
+def : ConcatPat<v4i32, v2i32>;
+def : ConcatPat<v4f32, v2f32>;
+def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v16i8, v8i8>;
+
+// If the high lanes are undef, though, we can just ignore them:
+class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
+ : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+def : ConcatUndefPat<v2i64, v1i64>;
+def : ConcatUndefPat<v2f64, v1f64>;
+def : ConcatUndefPat<v4i32, v2i32>;
+def : ConcatUndefPat<v4f32, v2f32>;
+def : ConcatUndefPat<v8i16, v4i16>;
+def : ConcatUndefPat<v16i8, v8i8>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
+defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
+defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
+defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
+defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
+defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
+defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
+defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+
+multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+ def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ (i64 0)))>;
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+ ssub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+ def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ ssub))>;
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ ssub))>;
+
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ ssub))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+ ssub))>;
+
+}
+
+multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+ (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+ ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+ dsub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
+ Intrinsic intOp> {
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+ ssub))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+ ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+ dsub))>;
+}
+
+defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_aarch64_neon_saddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>;
+def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>;
+def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>;
+def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>;
+def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
+defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
+
+// The vaddlv_s32 intrinsic gets mapped to SADDLP.
+def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (SADDLPv2i32_v1i64 V64:$Rn), dsub),
+ dsub))>;
+// The vaddlv_u32 intrinsic gets mapped to UADDLP.
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (UADDLPv2i32_v1i64 V64:$Rn), dsub),
+ dsub))>;
+
+//------------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//------------------------------------------------------------------------------
+
+// AdvSIMD BIC
+defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
+// AdvSIMD ORR
+defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
+
+def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// AdvSIMD FMOV
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
+ "fmov", ".2d",
+ [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8,
+ "fmov", ".2s",
+ [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
+ "fmov", ".4s",
+ [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+
+// AdvSIMD MOVI
+
+// EDIT byte mask: scalar
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
+ [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 here.
+def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
+ (MOVID imm0_255:$shift)>;
+
+def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>;
+
+def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;
+
+// EDIT byte mask: 2d
+
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 in the pattern
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
+ simdimmtype10,
+ "movi", ".2d",
+ [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
+
+
+// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
+// Complexity is added to break a tie with a plain MOVI.
+let AddedComplexity = 1 in {
+def : Pat<(f32 fpimm0),
+ (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
+ Requires<[HasZCZ]>;
+def : Pat<(f64 fpimm0),
+ (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
+ Requires<[HasZCZ]>;
+}
+
+def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+
+def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+
+def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
+ [(set (v2i32 V64:$Rd),
+ (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
+ [(set (v4i32 V128:$Rd),
+ (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+// Per byte: 8b & 16b
+def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255,
+ "movi", ".8b",
+ [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
+ "movi", ".16b",
+ [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
+// AdvSIMD MVNI
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
+
+def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
+ [(set (v2i32 V64:$Rd),
+ (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
+ [(set (v4i32 V128:$Rd),
+ (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let neverHasSideEffects = 1 in {
+ defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
+ defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
+}
+
+// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
+// instruction expects the addend first, while the intrinsic expects it last.
+
+// On the other hand, there are quite a few valid combinatorial options due to
+// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+ TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+
+multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
+ // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+ // and DUP scalar.
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (v2f32 (AArch64duplane32
+ (v4f32 (insert_subvector undef,
+ (v2f32 (fneg V64:$Rm)),
+ (i32 0))),
+ VectorIndexS:$idx)))),
+ (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+ VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+ (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+ // and DUP scalar.
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
+ VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (v4f32 (AArch64duplane32
+ (v4f32 (insert_subvector undef,
+ (v2f32 (fneg V64:$Rm)),
+ (i32 0))),
+ VectorIndexS:$idx)))),
+ (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+ VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+ (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
+ // (DUPLANE from 64-bit would be trivial).
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (AArch64duplane64 (v2f64 (fneg V128:$Rm)),
+ VectorIndexD:$idx))),
+ (FMLSv2i64_indexed
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (AArch64dup (f64 (fneg FPR64Op:$Rm))))),
+ (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+ // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v4f32 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v2f32 (fneg V64:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+ // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+ def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+ (vector_extract (v2f64 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+}
+
+defm : FMLSIndexedAfterNegPatterns<
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm : FMLSIndexedAfterNegPatterns<
+ TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+
+defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
+
+def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+ (FMULv2i32_indexed V64:$Rn,
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+ (i64 0))>;
+def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+ (FMULv4i32_indexed V128:$Rn,
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+ (i64 0))>;
+def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+ (FMULv2i64_indexed V128:$Rn,
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
+ (i64 0))>;
+
+defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
+ TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
+ TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
+defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
+ TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
+ int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+ int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+ int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
+defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
+ TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
+ TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
+ int_aarch64_neon_umull>;
+
+// A scalar sqdmull with the second operand being a vector lane can be
+// handled directly with the indexed instruction encoding.
+def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (vector_extract (v4i32 V128:$Vm),
+ VectorIndexS:$idx)),
+ (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
+defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
+defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
+// Codegen patterns for the above. We don't put these directly on the
+// instructions because TableGen's type inference can't handle the truth.
+// Having the same base pattern for fp <--> int totally freaks it out.
+def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
+ (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
+ (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
+ (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
+ (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+ (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
+ (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+ (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+ (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+
+defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
+defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
+defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
+ int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
+ int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
+ int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
+ int_aarch64_neon_sqshrun>;
+defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">;
+defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra",
+ TriOpFrag<(add node:$LHS,
+ (AArch64srshri node:$MHS, node:$RHS))>>;
+defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra",
+ TriOpFrag<(add node:$LHS,
+ (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
+ int_aarch64_neon_uqrshrn>;
+defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
+ int_aarch64_neon_uqshrn>;
+defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra",
+ TriOpFrag<(add node:$LHS,
+ (AArch64urshri node:$MHS, node:$RHS))>>;
+defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
+ TriOpFrag<(add node:$LHS,
+ (AArch64vlshr node:$MHS, node:$RHS))>>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
+defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
+defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
+ int_aarch64_neon_vcvtfxs2fp>;
+defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
+ int_aarch64_neon_rshrn>;
+defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
+defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
+ BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
+defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
+def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+ (i32 vecshiftL64:$imm))),
+ (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
+defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
+ int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
+ int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
+ int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
+ int_aarch64_neon_sqshrun>;
+defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
+def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm))),
+ (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
+defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
+ TriOpFrag<(add node:$LHS,
+ (AArch64srshri node:$MHS, node:$RHS))> >;
+defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
+ BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;
+
+defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
+ TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
+ int_aarch64_neon_vcvtfxu2fp>;
+defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
+ int_aarch64_neon_uqrshrn>;
+defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
+ int_aarch64_neon_uqshrn>;
+defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
+ TriOpFrag<(add node:$LHS,
+ (AArch64urshri node:$MHS, node:$RHS))> >;
+defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
+ BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
+ TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
+def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
+ (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
+ (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
+ (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+ (trunc (AArch64vlshr (v8i16 V128:$Rn),
+ vecshiftR16Narrow:$imm)))),
+ (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+ (trunc (AArch64vlshr (v4i32 V128:$Rn),
+ vecshiftR32Narrow:$imm)))),
+ (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+ (trunc (AArch64vlshr (v2i64 V128:$Rn),
+ vecshiftR64Narrow:$imm)))),
+ (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USSHLL.
+// Anyexts are implemented as zexts.
+def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+// Also match an extend from the upper half of a 128 bit source register.
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+ (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+ (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+ (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+ (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+ (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+ (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+ (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+ (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+ (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
+
+// Vector shift sxtl aliases
+def : InstAlias<"sxtl.8h $dst, $src1",
+ (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.8h, $src1.8b",
+ (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.4s $dst, $src1",
+ (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.4s, $src1.4h",
+ (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.2d $dst, $src1",
+ (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.2d, $src1.2s",
+ (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift sxtl2 aliases
+def : InstAlias<"sxtl2.8h $dst, $src1",
+ (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
+ (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.4s $dst, $src1",
+ (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
+ (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.2d $dst, $src1",
+ (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
+ (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// Vector shift uxtl aliases
+def : InstAlias<"uxtl.8h $dst, $src1",
+ (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.8h, $src1.8b",
+ (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.4s $dst, $src1",
+ (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.4s, $src1.4h",
+ (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.2d $dst, $src1",
+ (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.2d, $src1.2s",
+ (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift uxtl2 aliases
+def : InstAlias<"uxtl2.8h $dst, $src1",
+ (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
+ (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.4s $dst, $src1",
+ (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
+ (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.2d $dst, $src1",
+ (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
+ (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// These patterns are more complex because floating point loads do not
+// support sign extension.
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cyclces.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still being faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+ : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv8i8_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ INST,
+ bsub),
+ 0),
+ dsub)),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+ (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+ (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+ (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+ (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+ : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ INST,
+ hsub),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+ (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+ (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+ (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+ : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ INST,
+ hsub),
+ 0),
+ dsub)),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+ (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+ (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+ (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
+class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
+ : Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ INST,
+ ssub),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
+ (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
+ (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
+ (LDURSi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD Load-Store Structure
+//----------------------------------------------------------------------------
+defm LD1 : SIMDLd1Multiple<"ld1">;
+defm LD2 : SIMDLd2Multiple<"ld2">;
+defm LD3 : SIMDLd3Multiple<"ld3">;
+defm LD4 : SIMDLd4Multiple<"ld4">;
+
+defm ST1 : SIMDSt1Multiple<"st1">;
+defm ST2 : SIMDSt2Multiple<"st2">;
+defm ST3 : SIMDSt3Multiple<"st3">;
+defm ST4 : SIMDSt4Multiple<"st4">;
+
+class Ld1Pat<ValueType ty, Instruction INST>
+ : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>;
+
+def : Ld1Pat<v16i8, LD1Onev16b>;
+def : Ld1Pat<v8i16, LD1Onev8h>;
+def : Ld1Pat<v4i32, LD1Onev4s>;
+def : Ld1Pat<v2i64, LD1Onev2d>;
+def : Ld1Pat<v8i8, LD1Onev8b>;
+def : Ld1Pat<v4i16, LD1Onev4h>;
+def : Ld1Pat<v2i32, LD1Onev2s>;
+def : Ld1Pat<v1i64, LD1Onev1d>;
+
+class St1Pat<ValueType ty, Instruction INST>
+ : Pat<(store ty:$Vt, GPR64sp:$Rn),
+ (INST ty:$Vt, GPR64sp:$Rn)>;
+
+def : St1Pat<v16i8, ST1Onev16b>;
+def : St1Pat<v8i16, ST1Onev8h>;
+def : St1Pat<v4i32, ST1Onev4s>;
+def : St1Pat<v2i64, ST1Onev2d>;
+def : St1Pat<v8i8, ST1Onev8b>;
+def : St1Pat<v4i16, ST1Onev4h>;
+def : St1Pat<v2i32, ST1Onev2s>;
+def : St1Pat<v1i64, ST1Onev1d>;
+
+//---
+// Single-element
+//---
+
+defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
+defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
+defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
+defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
+defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
+defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
+defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>;
+defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>;
+defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>;
+defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>;
+defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>;
+defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>;
+defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>;
+defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
+defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
+defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>;
+defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>;
+defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>;
+defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>;
+}
+
+def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+ (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+ (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+ (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+ (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+ (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+ (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+ (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+ (LD1Rv1d GPR64sp:$Rn)>;
+// Grab the floating point version too
+def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+ (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+ (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+ (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+ (LD1Rv1d GPR64sp:$Rn)>;
+
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction LD1>
+ : Pat<(vector_insert (VTy VecListOne128:$Rd),
+ (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction LD1>
+ : Pat<(vector_insert (VTy VecListOne64:$Rd),
+ (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (EXTRACT_SUBREG
+ (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+ VecIndex:$idx, GPR64sp:$Rn),
+ dsub)>;
+
+def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
+
+
+defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
+defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
+defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
+defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
+
+// Stores
+defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 15 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction ST1>
+ : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+ GPR64sp:$Rn),
+ (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane128Pat<truncstorei8, VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
+
+let AddedComplexity = 15 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction ST1>
+ : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+ GPR64sp:$Rn),
+ (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+ VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
+
+multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction ST1,
+ int offset> {
+ def : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+ GPR64sp:$Rn, offset),
+ (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+ VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+ def : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+ GPR64sp:$Rn, GPR64:$Rm),
+ (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+ VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
+defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
+ 2>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+
+multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction ST1,
+ int offset> {
+ def : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+ GPR64sp:$Rn, offset),
+ (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+ def : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+ GPR64sp:$Rn, GPR64:$Rm),
+ (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
+ 1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
+ 2>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
+def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
+def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
+def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
+def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
+def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;
+
+def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>;
+def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREF into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use an unsigned bitfield move
+// instruction (UBFM) on the enclosing super-reg.
+def : Pat<(i64 (zext GPR32:$src)),
+ (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+ (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+ (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+ (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of the
+// original value which is to be sign extended. E.g. we support shifts up to
+// bitwidth-1 bits.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+ (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+ (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// To truncate, we can simply extract from a subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+ (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() uses the BRK instruction on AArch64.
+def : Pat<(trap), (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+// But because we need a consistent lane ordering, in big endian many
+// conversions require one or more REV instructions.
+//
+// Consider a simple memory load followed by a bitconvert then a store.
+// v0 = load v2i32
+// v1 = BITCAST v2i32 v0 to v4i16
+// store v4i16 v2
+//
+// In big endian mode every memory access has an implicit byte swap. LDR and
+// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
+// is, they treat the vector as a sequence of elements to be byte-swapped.
+// The two pairs of instructions are fundamentally incompatible. We've decided
+// to use LD1/ST1 only to simplify compiler implementation.
+//
+// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
+// the original code sequence:
+// v0 = load v2i32
+// v1 = REV v2i32 (implicit)
+// v2 = BITCAST v2i32 v1 to v4i16
+// v3 = REV v4i16 v2 (implicit)
+// store v4i16 v3
+//
+// But this is now broken - the value stored is different to the value loaded
+// due to lane reordering. To fix this, on every BITCAST we must perform two
+// other REVs:
+// v0 = load v2i32
+// v1 = REV v2i32 (implicit)
+// v2 = REV v2i32
+// v3 = BITCAST v2i32 v2 to v4i16
+// v4 = REV v4i16
+// v5 = REV v4i16 v4 (implicit)
+// store v4i16 v5
+//
+// This means an extra two instructions, but actually in most cases the two REV
+// instructions can be combined into one. For example:
+// (REV64_2s (REV64_4h X)) === (REV32_4h X)
+//
+// There is also no 128-bit REV instruction. This must be synthesized with an
+// EXT instruction.
+//
+// Most bitconverts require some sort of conversion. The only exceptions are:
+// a) Identity conversions - vNfX <-> vNiX
+// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
//
-include "AArch64InstrNEON.td"
+let Predicates = [IsLE] in {
+def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+
+def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8 (bitconvert GPR64:$Xn)),
+ (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
+ (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
+ (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
+ (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+
+def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
+ (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+ (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+ (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+ (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+}
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+ (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+ (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+ (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+ (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+ (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+ (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
+ (v1i64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
+ (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
+ (v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
+ (v1i64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
+ (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
+ (v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))),
+ (v2i32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
+ (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
+ (v2i32 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
+ (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
+ (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
+ (v4i16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
+ (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
+ (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
+ (v4i16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
+ (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))),
+ (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))),
+ (v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))),
+ (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
+ (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
+ (v8i8 (REV64v8i8 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
+ (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))),
+ (f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
+ (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
+ (f64 (REV64v8i8 FPR64:$src))>;
+}
+def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
+ (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
+ (v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))),
+ (v1f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
+ (v1f64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
+ (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
+ (v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))),
+ (v2f32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
+ (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
+ (v2f32 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
+ (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+ (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
+ (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+ (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
+ (REV64v16i8 FPR128:$src), (i32 8)))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
+ (v2f64 (EXTv16i8 FPR128:$src,
+ FPR128:$src, (i32 8)))>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
+ (v2f64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
+ (v2f64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
+ (v2f64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
+ (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+ (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
+ (v4f32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
+ (v4f32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
+ (v4f32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
+ (v2i64 (EXTv16i8 FPR128:$src,
+ FPR128:$src, (i32 8)))>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
+ (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
+ (v2i64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
+ (v2i64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
+ (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+ (REV64v4i32 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
+ (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
+ (v4i32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
+ (v4i32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
+ (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
+ (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
+ (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
+ (v8i16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
+ (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
+ (v8i16 (REV32v8i16 FPR128:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
+ (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
+ (REV64v16i8 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
+ (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
+ (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
+ (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
+ (v16i8 (REV32v16i8 FPR128:$src))>;
+}
+
+def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+
+// A 64-bit subvector insert to the first 128-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
+// or v2f32.
+def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
+ (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
+def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+ (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
+ // vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
+ // so we match on v4f32 here, not v2f32. This will also catch adding
+ // the low two lanes of a true v4f32 vector.
+def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+ (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+ (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
+
+// Scalar 64-bit shifts in FPR64 registers.
+def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+
+// Tail call return handling. These are all compiler pseudo-instructions,
+// so no encoding information or anything like that.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+include "AArch64InstrAtomics.td"
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
deleted file mode 100644
index 0b97e3b..0000000
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ /dev/null
@@ -1,9476 +0,0 @@
-//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the AArch64 NEON instruction set.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// NEON-specific DAG Nodes.
-//===----------------------------------------------------------------------===//
-
-// (outs Result), (ins Imm, OpCmode)
-def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
-
-def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>;
-
-def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>;
-
-// (outs Result), (ins Imm)
-def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1,
- [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
-
-// (outs Result), (ins LHS, RHS, CondCode)
-def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3,
- [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
-
-// (outs Result), (ins LHS, 0/0.0 constant, CondCode)
-def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
- [SDTCisVec<0>, SDTCisVec<1>]>>;
-
-// (outs Result), (ins LHS, RHS)
-def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
-
-def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
-def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
-def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
-
-def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>]>;
-def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>;
-def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>;
-def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>;
-def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>;
-def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>;
-def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>;
-
-def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
-def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>;
-def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>;
-def Neon_rev16 : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>;
-def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
- [SDTCisVec<0>]>>;
-def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
-def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>;
-
-//===----------------------------------------------------------------------===//
-// Addressing-mode instantiations
-//===----------------------------------------------------------------------===//
-
-multiclass ls_64_pats<dag address, dag Base, dag Offset, ValueType Ty> {
-defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, dword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, dword_uimm12,
- !subst(ALIGN, min_align8, decls.pattern))),
- Ty>;
-}
-
-multiclass ls_128_pats<dag address, dag Base, dag Offset, ValueType Ty> {
-defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
- !foreach(decls.pattern, Offset,
- !subst(OFFSET, qword_uimm12, decls.pattern)),
- !foreach(decls.pattern, address,
- !subst(OFFSET, qword_uimm12,
- !subst(ALIGN, min_align16, decls.pattern))),
- Ty>;
-}
-
-multiclass uimm12_neon_pats<dag address, dag Base, dag Offset> {
- defm : ls_64_pats<address, Base, Offset, v8i8>;
- defm : ls_64_pats<address, Base, Offset, v4i16>;
- defm : ls_64_pats<address, Base, Offset, v2i32>;
- defm : ls_64_pats<address, Base, Offset, v1i64>;
- defm : ls_64_pats<address, Base, Offset, v2f32>;
- defm : ls_64_pats<address, Base, Offset, v1f64>;
-
- defm : ls_128_pats<address, Base, Offset, v16i8>;
- defm : ls_128_pats<address, Base, Offset, v8i16>;
- defm : ls_128_pats<address, Base, Offset, v4i32>;
- defm : ls_128_pats<address, Base, Offset, v2i64>;
- defm : ls_128_pats<address, Base, Offset, v4f32>;
- defm : ls_128_pats<address, Base, Offset, v2f64>;
-}
-
-defm : uimm12_neon_pats<(A64WrapperSmall
- tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
- (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
-
-//===----------------------------------------------------------------------===//
-// Multiclasses
-//===----------------------------------------------------------------------===//
-
-multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size, bits<5> opcode,
- string asmop, SDPatternOperator opnode8B,
- SDPatternOperator opnode16B,
- bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8B : NeonI_3VSame<0b0, u, size, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
- [(set (v8i8 VPR64:$Rd),
- (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _16B : NeonI_3VSame<0b1, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-
-}
-
-multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
- string asmop, SDPatternOperator opnode,
- bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _4H : NeonI_3VSame<0b0, u, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h",
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _8H : NeonI_3VSame<0b1, u, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h",
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _2S : NeonI_3VSame<0b0, u, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _4S : NeonI_3VSame<0b1, u, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
- string asmop, SDPatternOperator opnode,
- bit Commutable = 0>
- : NeonI_3VSame_HS_sizes<u, opcode, asmop, opnode, Commutable> {
- let isCommutable = Commutable in {
- def _8B : NeonI_3VSame<0b0, u, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
- [(set (v8i8 VPR64:$Rd),
- (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _16B : NeonI_3VSame<0b1, u, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
- string asmop, SDPatternOperator opnode,
- bit Commutable = 0>
- : NeonI_3VSame_BHS_sizes<u, opcode, asmop, opnode, Commutable> {
- let isCommutable = Commutable in {
- def _2D : NeonI_3VSame<0b1, u, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
- [(set (v2i64 VPR128:$Rd),
- (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types,
-// but Result types can be integer or floating point types.
-multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
- string asmop, SDPatternOperator opnode,
- ValueType ResTy2S, ValueType ResTy4S,
- ValueType ResTy2D, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
- [(set (ResTy2S VPR64:$Rd),
- (ResTy2S (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
- [(set (ResTy4S VPR128:$Rd),
- (ResTy4S (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
- [(set (ResTy2D VPR128:$Rd),
- (ResTy2D (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Instruction Definitions
-//===----------------------------------------------------------------------===//
-
-// Vector Arithmetic Instructions
-
-// Vector Add (Integer and Floating-Point)
-
-defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>;
-defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd,
- v2f32, v4f32, v2f64, 1>;
-
-// Patterns to match add of v1i8/v1i16/v1i32 types
-def : Pat<(v1i8 (add FPR8:$Rn, FPR8:$Rm)),
- (EXTRACT_SUBREG
- (ADDvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
- sub_8)>;
-def : Pat<(v1i16 (add FPR16:$Rn, FPR16:$Rm)),
- (EXTRACT_SUBREG
- (ADDvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
- sub_16)>;
-def : Pat<(v1i32 (add FPR32:$Rn, FPR32:$Rm)),
- (EXTRACT_SUBREG
- (ADDvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
- sub_32)>;
-
-// Vector Sub (Integer and Floating-Point)
-
-defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>;
-defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub,
- v2f32, v4f32, v2f64, 0>;
-
-// Patterns to match sub of v1i8/v1i16/v1i32 types
-def : Pat<(v1i8 (sub FPR8:$Rn, FPR8:$Rm)),
- (EXTRACT_SUBREG
- (SUBvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
- sub_8)>;
-def : Pat<(v1i16 (sub FPR16:$Rn, FPR16:$Rm)),
- (EXTRACT_SUBREG
- (SUBvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
- sub_16)>;
-def : Pat<(v1i32 (sub FPR32:$Rn, FPR32:$Rm)),
- (EXTRACT_SUBREG
- (SUBvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
- sub_32)>;
-
-// Vector Multiply (Integer and Floating-Point)
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>;
-defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul,
- v2f32, v4f32, v2f64, 1>;
-}
-
-// Patterns to match mul of v1i8/v1i16/v1i32 types
-def : Pat<(v1i8 (mul FPR8:$Rn, FPR8:$Rm)),
- (EXTRACT_SUBREG
- (MULvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
- sub_8)>;
-def : Pat<(v1i16 (mul FPR16:$Rn, FPR16:$Rm)),
- (EXTRACT_SUBREG
- (MULvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
- sub_16)>;
-def : Pat<(v1i32 (mul FPR32:$Rn, FPR32:$Rm)),
- (EXTRACT_SUBREG
- (MULvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
- sub_32)>;
-
-// Vector Multiply (Polynomial)
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
- int_arm_neon_vmulp, int_arm_neon_vmulp, 1>;
-}
-
-// Vector Multiply-accumulate and Multiply-subtract (Integer)
-
-// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and
-// two operands constraints.
-class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
- RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size,
- bits<5> opcode, SDPatternOperator opnode>
- : NeonI_3VSame<q, u, size, opcode,
- (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm),
- asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
- [(set (OpTy VPRC:$Rd),
- (OpTy (opnode (OpTy VPRC:$src), (OpTy VPRC:$Rn), (OpTy VPRC:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (add node:$Ra, (mul node:$Rn, node:$Rm))>;
-
-def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (sub node:$Ra, (mul node:$Rn, node:$Rm))>;
-
-
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in {
-def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8,
- 0b0, 0b0, 0b00, 0b10010, Neon_mla>;
-def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8,
- 0b1, 0b0, 0b00, 0b10010, Neon_mla>;
-def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16,
- 0b0, 0b0, 0b01, 0b10010, Neon_mla>;
-def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16,
- 0b1, 0b0, 0b01, 0b10010, Neon_mla>;
-def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32,
- 0b0, 0b0, 0b10, 0b10010, Neon_mla>;
-def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32,
- 0b1, 0b0, 0b10, 0b10010, Neon_mla>;
-
-def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8,
- 0b0, 0b1, 0b00, 0b10010, Neon_mls>;
-def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8,
- 0b1, 0b1, 0b00, 0b10010, Neon_mls>;
-def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16,
- 0b0, 0b1, 0b01, 0b10010, Neon_mls>;
-def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16,
- 0b1, 0b1, 0b01, 0b10010, Neon_mls>;
-def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32,
- 0b0, 0b1, 0b10, 0b10010, Neon_mls>;
-def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32,
- 0b1, 0b1, 0b10, 0b10010, Neon_mls>;
-}
-
-// Vector Multiply-accumulate and Multiply-subtract (Floating Point)
-
-def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (fadd node:$Ra, (fmul_su node:$Rn, node:$Rm))>;
-
-def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (fsub node:$Ra, (fmul_su node:$Rn, node:$Rm))>;
-
-let Predicates = [HasNEON, UseFusedMAC],
- SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in {
-def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32,
- 0b0, 0b0, 0b00, 0b11001, Neon_fmla>;
-def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32,
- 0b1, 0b0, 0b00, 0b11001, Neon_fmla>;
-def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64,
- 0b1, 0b0, 0b01, 0b11001, Neon_fmla>;
-
-def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32,
- 0b0, 0b0, 0b10, 0b11001, Neon_fmls>;
-def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32,
- 0b1, 0b0, 0b10, 0b11001, Neon_fmls>;
-def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64,
- 0b1, 0b0, 0b11, 0b11001, Neon_fmls>;
-}
-
-// We're also allowed to match the fma instruction regardless of compile
-// options.
-def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)),
- (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
-def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
- (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
- (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-
-def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)),
- (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
-def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
- (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
- (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-
-// Vector Divide (Floating-Point)
-
-let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
-defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv,
- v2f32, v4f32, v2f64, 0>;
-}
-
-// Vector Bitwise Operations
-
-// Vector Bitwise AND
-
-defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>;
-
-// Vector Bitwise Exclusive OR
-
-defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>;
-
-// Vector Bitwise OR
-
-defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>;
-
-// ORR disassembled as MOV if Vn==Vm
-
-// Vector Move - register
-// Alias for ORR if Vn=Vm.
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-def : NeonInstAlias<"mov $Rd.8b, $Rn.8b",
- (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>;
-def : NeonInstAlias<"mov $Rd.16b, $Rn.16b",
- (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>;
-
-// The MOVI instruction takes two immediate operands. The first is the
-// immediate encoding, while the second is the cmode. A cmode of 14, or
-// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC.
-def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>;
-def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>;
-
-def Neon_not8B : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>;
-def Neon_not16B : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>;
-
-def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm),
- (or node:$Rn, (Neon_not8B node:$Rm))>;
-
-def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm),
- (or node:$Rn, (Neon_not16B node:$Rm))>;
-
-def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm),
- (and node:$Rn, (Neon_not8B node:$Rm))>;
-
-def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm),
- (and node:$Rn, (Neon_not16B node:$Rm))>;
-
-
-// Vector Bitwise OR NOT - register
-
-defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn",
- Neon_orn8B, Neon_orn16B, 0>;
-
-// Vector Bitwise Bit Clear (AND NOT) - register
-
-defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic",
- Neon_bic8B, Neon_bic16B, 0>;
-
-multiclass Neon_bitwise2V_patterns<SDPatternOperator opnode8B,
- SDPatternOperator opnode16B,
- Instruction INST8B,
- Instruction INST16B> {
- def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$Rn, VPR128:$Rm)>;
-}
-
-// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN
-defm : Neon_bitwise2V_patterns<and, and, ANDvvv_8B, ANDvvv_16B>;
-defm : Neon_bitwise2V_patterns<or, or, ORRvvv_8B, ORRvvv_16B>;
-defm : Neon_bitwise2V_patterns<xor, xor, EORvvv_8B, EORvvv_16B>;
-defm : Neon_bitwise2V_patterns<Neon_bic8B, Neon_bic16B, BICvvv_8B, BICvvv_16B>;
-defm : Neon_bitwise2V_patterns<Neon_orn8B, Neon_orn16B, ORNvvv_8B, ORNvvv_16B>;
-
-// Vector Bitwise Select
-def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8,
- 0b0, 0b1, 0b01, 0b00011, vselect>;
-
-def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8,
- 0b1, 0b1, 0b01, 0b00011, vselect>;
-
-multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
- Instruction INST8B,
- Instruction INST16B> {
- // Disassociate type from instruction definition
- def : Pat<(v8i8 (opnode (v8i8 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v8i16 (opnode (v8i16 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2i64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2f64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v4f32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-
- // Allow to match BSL instruction pattern with non-constant operand
- def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd),
- (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
- (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd),
- (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
- (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd),
- (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
- (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd),
- (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
- (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd),
- (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
- (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd),
- (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
- (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd),
- (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
- (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd),
- (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
- (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
-
- // Allow to match llvm.arm.* intrinsics.
- def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src),
- (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src),
- (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src),
- (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src),
- (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src),
- (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src),
- (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))),
- (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src),
- (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src),
- (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src),
- (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src),
- (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src),
- (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src),
- (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
- (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-}
-
-// Additional patterns for bitwise instruction BSL
-defm: Neon_bitwise3V_patterns<vselect, BSLvvv_8B, BSLvvv_16B>;
-
-def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm),
- (vselect node:$src, node:$Rn, node:$Rm),
- [{ (void)N; return false; }]>;
-
-// Vector Bitwise Insert if True
-
-def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8,
- 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
-def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8,
- 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
-
-// Vector Bitwise Insert if False
-
-def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8,
- 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
-def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8,
- 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
-
-// Vector Absolute Difference and Accumulate (Signed, Unsigned)
-
-def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>;
-def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>;
-
-// Vector Absolute Difference and Accumulate (Unsigned)
-def UABAvvv_8B : NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8,
- 0b0, 0b1, 0b00, 0b01111, Neon_uaba>;
-def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8,
- 0b1, 0b1, 0b00, 0b01111, Neon_uaba>;
-def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16,
- 0b0, 0b1, 0b01, 0b01111, Neon_uaba>;
-def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16,
- 0b1, 0b1, 0b01, 0b01111, Neon_uaba>;
-def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32,
- 0b0, 0b1, 0b10, 0b01111, Neon_uaba>;
-def UABAvvv_4S : NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32,
- 0b1, 0b1, 0b10, 0b01111, Neon_uaba>;
-
-// Vector Absolute Difference and Accumulate (Signed)
-def SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8,
- 0b0, 0b0, 0b00, 0b01111, Neon_saba>;
-def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8,
- 0b1, 0b0, 0b00, 0b01111, Neon_saba>;
-def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16,
- 0b0, 0b0, 0b01, 0b01111, Neon_saba>;
-def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16,
- 0b1, 0b0, 0b01, 0b01111, Neon_saba>;
-def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32,
- 0b0, 0b0, 0b10, 0b01111, Neon_saba>;
-def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32,
- 0b1, 0b0, 0b10, 0b01111, Neon_saba>;
-
-
-// Vector Absolute Difference (Signed, Unsigned)
-defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>;
-defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>;
-
-// Vector Absolute Difference (Floating Point)
-defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd",
- int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>;
-
-// Vector Reciprocal Step (Floating Point)
-defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps",
- int_arm_neon_vrecps,
- v2f32, v4f32, v2f64, 0>;
-
-// Vector Reciprocal Square Root Step (Floating Point)
-defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts",
- int_arm_neon_vrsqrts,
- v2f32, v4f32, v2f64, 0>;
-
-// Vector Comparisons
-
-def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs),
- (Neon_cmp node:$lhs, node:$rhs, SETEQ)>;
-def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs),
- (Neon_cmp node:$lhs, node:$rhs, SETUGE)>;
-def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs),
- (Neon_cmp node:$lhs, node:$rhs, SETGE)>;
-def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs),
- (Neon_cmp node:$lhs, node:$rhs, SETUGT)>;
-def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs),
- (Neon_cmp node:$lhs, node:$rhs, SETGT)>;
-
-// NeonI_compare_aliases class: swaps register operands to implement
-// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed.
-class NeonI_compare_aliases<string asmop, string asmlane,
- Instruction inst, RegisterOperand VPRC>
- : NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane #
- ", $Rm" # asmlane,
- (inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>;
-
-// Vector Comparisons (Integer)
-
-// Vector Compare Mask Equal (Integer)
-let isCommutable =1 in {
-defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>;
-}
-
-// Vector Compare Mask Higher or Same (Unsigned Integer)
-defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>;
-
-// Vector Compare Mask Greater Than or Equal (Integer)
-defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>;
-
-// Vector Compare Mask Higher (Unsigned Integer)
-defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>;
-
-// Vector Compare Mask Greater Than (Integer)
-defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>;
-
-// Vector Compare Mask Bitwise Test (Integer)
-defm CMTSTvvv: NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>;
-
-// Vector Compare Mask Less or Same (Unsigned Integer)
-// CMLS is alias for CMHS with operands reversed.
-def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>;
-def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>;
-def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>;
-def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>;
-def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>;
-def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>;
-def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>;
-
-// Vector Compare Mask Less Than or Equal (Integer)
-// CMLE is alias for CMGE with operands reversed.
-def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>;
-def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>;
-def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>;
-def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>;
-def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>;
-def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>;
-def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>;
-
-// Vector Compare Mask Lower (Unsigned Integer)
-// CMLO is alias for CMHI with operands reversed.
-def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>;
-def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>;
-def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>;
-def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>;
-def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>;
-def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>;
-def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>;
-
-// Vector Compare Mask Less Than (Integer)
-// CMLT is alias for CMGT with operands reversed.
-def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>;
-def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>;
-def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>;
-def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>;
-def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>;
-def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>;
-def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>;
-
-
-def neon_uimm0_asmoperand : AsmOperandClass
-{
- let Name = "UImm0";
- let PredicateMethod = "isUImm<0>";
- let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm0 : Operand<i32>, ImmLeaf<i32, [{return Imm == 0;}]> {
- let ParserMatchClass = neon_uimm0_asmoperand;
- let PrintMethod = "printNeonUImm0Operand";
-
-}
-
-multiclass NeonI_cmpz_sizes<bit u, bits<5> opcode, string asmop, CondCode CC>
-{
- def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.8b, $Rn.8b, $Imm",
- [(set (v8i8 VPR64:$Rd),
- (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.16b, $Rn.16b, $Imm",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.4h, $Rn.4h, $Imm",
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.8h, $Rn.8h, $Imm",
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.2s, $Rn.2s, $Imm",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.4s, $Rn.4s, $Imm",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
- asmop # "\t$Rd.2d, $Rn.2d, $Imm",
- [(set (v2i64 VPR128:$Rd),
- (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-// Vector Compare Mask Equal to Zero (Integer)
-defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>;
-
-// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
-defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>;
-
-// Vector Compare Mask Greater Than Zero (Signed Integer)
-defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>;
-
-// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
-defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>;
-
-// Vector Compare Mask Less Than Zero (Signed Integer)
-defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>;
-
-// Vector Comparisons (Floating Point)
-
-// Vector Compare Mask Equal (Floating Point)
-let isCommutable =1 in {
-defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq,
- v2i32, v4i32, v2i64, 0>;
-}
-
-// Vector Compare Mask Greater Than Or Equal (Floating Point)
-defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge,
- v2i32, v4i32, v2i64, 0>;
-
-// Vector Compare Mask Greater Than (Floating Point)
-defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt,
- v2i32, v4i32, v2i64, 0>;
-
-// Vector Compare Mask Less Than Or Equal (Floating Point)
-// FCMLE is alias for FCMGE with operands reversed.
-def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>;
-def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>;
-def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>;
-
-// Vector Compare Mask Less Than (Floating Point)
-// FCMLT is alias for FCMGT with operands reversed.
-def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>;
-def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>;
-def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>;
-
-def fpzero_izero_asmoperand : AsmOperandClass {
- let Name = "FPZeroIZero";
- let ParserMethod = "ParseFPImm0AndImm0Operand";
- let DiagnosticType = "FPZero";
-}
-
-def fpzz32 : Operand<f32>,
- ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
- let ParserMatchClass = fpzero_izero_asmoperand;
- let PrintMethod = "printFPZeroOperand";
- let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode,
- string asmop, CondCode CC>
-{
- def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, fpzz32:$FPImm),
- asmop # "\t$Rd.2s, $Rn.2s, $FPImm",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpzz32:$FPImm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm),
- asmop # "\t$Rd.4s, $Rn.4s, $FPImm",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm),
- asmop # "\t$Rd.2d, $Rn.2d, $FPImm",
- [(set (v2i64 VPR128:$Rd),
- (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-// Vector Compare Mask Equal to Zero (Floating Point)
-defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>;
-
-// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
-defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>;
-
-// Vector Compare Mask Greater Than Zero (Floating Point)
-defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>;
-
-// Vector Compare Mask Less Than or Equal To Zero (Floating Point)
-defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>;
-
-// Vector Compare Mask Less Than Zero (Floating Point)
-defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>;
-
-// Vector Absolute Comparisons (Floating Point)
-
-// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point)
-defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge",
- int_arm_neon_vacge,
- v2i32, v4i32, v2i64, 0>;
-
-// Vector Absolute Compare Mask Greater Than (Floating Point)
-defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt",
- int_arm_neon_vacgt,
- v2i32, v4i32, v2i64, 0>;
-
-// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
-// FACLE is alias for FACGE with operands reversed.
-def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>;
-def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>;
-def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>;
-
-// Vector Absolute Compare Mask Less Than (Floating Point)
-// FACLT is alias for FACGT with operands reversed.
-def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>;
-def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>;
-def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>;
-
-// Vector halving add (Integer Signed, Unsigned)
-defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd",
- int_arm_neon_vhadds, 1>;
-defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd",
- int_arm_neon_vhaddu, 1>;
-
-// Vector halving sub (Integer Signed, Unsigned)
-defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub",
- int_arm_neon_vhsubs, 0>;
-defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub",
- int_arm_neon_vhsubu, 0>;
-
-// Vector rouding halving add (Integer Signed, Unsigned)
-defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd",
- int_arm_neon_vrhadds, 1>;
-defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd",
- int_arm_neon_vrhaddu, 1>;
-
-// Vector Saturating add (Integer Signed, Unsigned)
-defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd",
- int_arm_neon_vqadds, 1>;
-defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd",
- int_arm_neon_vqaddu, 1>;
-
-// Vector Saturating sub (Integer Signed, Unsigned)
-defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub",
- int_arm_neon_vqsubs, 1>;
-defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub",
- int_arm_neon_vqsubu, 1>;
-
-// Vector Shift Left (Signed and Unsigned Integer)
-defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl",
- int_arm_neon_vshifts, 1>;
-defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl",
- int_arm_neon_vshiftu, 1>;
-
-// Vector Saturating Shift Left (Signed and Unsigned Integer)
-defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl",
- int_arm_neon_vqshifts, 1>;
-defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl",
- int_arm_neon_vqshiftu, 1>;
-
-// Vector Rouding Shift Left (Signed and Unsigned Integer)
-defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl",
- int_arm_neon_vrshifts, 1>;
-defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl",
- int_arm_neon_vrshiftu, 1>;
-
-// Vector Saturating Rouding Shift Left (Signed and Unsigned Integer)
-defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl",
- int_arm_neon_vqrshifts, 1>;
-defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl",
- int_arm_neon_vqrshiftu, 1>;
-
-// Vector Maximum (Signed and Unsigned Integer)
-defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>;
-defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>;
-
-// Vector Minimum (Signed and Unsigned Integer)
-defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>;
-defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>;
-
-// Vector Maximum (Floating Point)
-defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax",
- int_arm_neon_vmaxs,
- v2f32, v4f32, v2f64, 1>;
-
-// Vector Minimum (Floating Point)
-defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin",
- int_arm_neon_vmins,
- v2f32, v4f32, v2f64, 1>;
-
-// Vector maxNum (Floating Point) - prefer a number over a quiet NaN)
-defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm",
- int_aarch64_neon_vmaxnm,
- v2f32, v4f32, v2f64, 1>;
-
-// Vector minNum (Floating Point) - prefer a number over a quiet NaN)
-defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm",
- int_aarch64_neon_vminnm,
- v2f32, v4f32, v2f64, 1>;
-
-// Vector Maximum Pairwise (Signed and Unsigned Integer)
-defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>;
-defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>;
-
-// Vector Minimum Pairwise (Signed and Unsigned Integer)
-defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>;
-defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>;
-
-// Vector Maximum Pairwise (Floating Point)
-defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp",
- int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>;
-
-// Vector Minimum Pairwise (Floating Point)
-defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp",
- int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>;
-
-// Vector maxNum Pairwise (Floating Point) - prefer a number over a quiet NaN)
-defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp",
- int_aarch64_neon_vpmaxnm,
- v2f32, v4f32, v2f64, 1>;
-
-// Vector minNum Pairwise (Floating Point) - prefer a number over a quiet NaN)
-defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp",
- int_aarch64_neon_vpminnm,
- v2f32, v4f32, v2f64, 1>;
-
-// Vector Addition Pairwise (Integer)
-defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>;
-
-// Vector Addition Pairwise (Floating Point)
-defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp",
- int_arm_neon_vpadd,
- v2f32, v4f32, v2f64, 1>;
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-// Vector Saturating Doubling Multiply High
-defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh",
- int_arm_neon_vqdmulh, 1>;
-
-// Vector Saturating Rouding Doubling Multiply High
-defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh",
- int_arm_neon_vqrdmulh, 1>;
-
-// Vector Multiply Extended (Floating Point)
-defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx",
- int_aarch64_neon_vmulx,
- v2f32, v4f32, v2f64, 1>;
-}
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// ADDP, SMINP, UMINP, SMAXP, UMAXP having i32 as output
-class Neon_VectorPair_v2i32_pattern<SDPatternOperator opnode, Instruction INST>
- : Pat<(v1i32 (opnode (v2i32 VPR64:$Rn))),
- (EXTRACT_SUBREG
- (v2i32 (INST (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rn))),
- sub_32)>;
-
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_sminv, SMINPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_uminv, UMINPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_smaxv, SMAXPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_umaxv, UMAXPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_vaddv, ADDP_2S>;
-
-// Vector Immediate Instructions
-
-multiclass neon_mov_imm_shift_asmoperands<string PREFIX>
-{
- def _asmoperand : AsmOperandClass
- {
- let Name = "NeonMovImmShift" # PREFIX;
- let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands";
- let PredicateMethod = "isNeonMovImmShift" # PREFIX;
- }
-}
-
-// Definition of vector immediates shift operands
-
-// The selectable use-cases extract the shift operation
-// information from the OpCmode fields encoded in the immediate.
-def neon_mod_shift_imm_XFORM : SDNodeXForm<imm, [{
- uint64_t OpCmode = N->getZExtValue();
- unsigned ShiftImm;
- unsigned ShiftOnesIn;
- unsigned HasShift =
- A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
- if (!HasShift) return SDValue();
- return CurDAG->getTargetConstant(ShiftImm, MVT::i32);
-}]>;
-
-// Vector immediates shift operands which accept LSL and MSL
-// shift operators with shift value in the range of 0, 8, 16, 24 (LSL),
-// or 0, 8 (LSLH) or 8, 16 (MSL).
-defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">;
-defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">;
-// LSLH restricts shift amount to 0, 8 out of 0, 8, 16, 24
-defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">;
-
-multiclass neon_mov_imm_shift_operands<string PREFIX,
- string HALF, string ISHALF, code pred>
-{
- def _operand : Operand<i32>, ImmLeaf<i32, pred, neon_mod_shift_imm_XFORM>
- {
- let PrintMethod =
- "printNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
- let DecoderMethod =
- "DecodeNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
- let ParserMatchClass =
- !cast<AsmOperandClass>("neon_mov_imm_" # PREFIX # HALF # "_asmoperand");
- }
-}
-
-defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{
- unsigned ShiftImm;
- unsigned ShiftOnesIn;
- unsigned HasShift =
- A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
- return (HasShift && !ShiftOnesIn);
-}]>;
-
-defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{
- unsigned ShiftImm;
- unsigned ShiftOnesIn;
- unsigned HasShift =
- A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
- return (HasShift && ShiftOnesIn);
-}]>;
-
-defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{
- unsigned ShiftImm;
- unsigned ShiftOnesIn;
- unsigned HasShift =
- A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
- return (HasShift && !ShiftOnesIn);
-}]>;
-
-def neon_uimm1_asmoperand : AsmOperandClass
-{
- let Name = "UImm1";
- let PredicateMethod = "isUImm<1>";
- let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm2_asmoperand : AsmOperandClass
-{
- let Name = "UImm2";
- let PredicateMethod = "isUImm<2>";
- let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm8_asmoperand : AsmOperandClass
-{
- let Name = "UImm8";
- let PredicateMethod = "isUImm<8>";
- let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
- let ParserMatchClass = neon_uimm8_asmoperand;
- let PrintMethod = "printUImmHexOperand";
-}
-
-def neon_uimm64_mask_asmoperand : AsmOperandClass
-{
- let Name = "NeonUImm64Mask";
- let PredicateMethod = "isNeonUImm64Mask";
- let RenderMethod = "addNeonUImm64MaskOperands";
-}
-
-// MCOperand for 64-bit bytemask with each byte having only the
-// value 0x00 and 0xff is encoded as an unsigned 8-bit value
-def neon_uimm64_mask : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
- let ParserMatchClass = neon_uimm64_mask_asmoperand;
- let PrintMethod = "printNeonUImm64MaskOperand";
-}
-
-multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
- SDPatternOperator opnode>
-{
- // shift zeros, per word
- def _2S : NeonI_1VModImm<0b0, op,
- (outs VPR64:$Rd),
- (ins neon_uimm8:$Imm,
- neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (opnode (timm:$Imm),
- (neon_mov_imm_LSL_operand:$Simm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- bits<2> Simm;
- let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
- }
-
- def _4S : NeonI_1VModImm<0b1, op,
- (outs VPR128:$Rd),
- (ins neon_uimm8:$Imm,
- neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (opnode (timm:$Imm),
- (neon_mov_imm_LSL_operand:$Simm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- bits<2> Simm;
- let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
- }
-
- // shift zeros, per halfword
- def _4H : NeonI_1VModImm<0b0, op,
- (outs VPR64:$Rd),
- (ins neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (opnode (timm:$Imm),
- (neon_mov_imm_LSLH_operand:$Simm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- bit Simm;
- let cmode = {0b1, 0b0, Simm, 0b0};
- }
-
- def _8H : NeonI_1VModImm<0b1, op,
- (outs VPR128:$Rd),
- (ins neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (opnode (timm:$Imm),
- (neon_mov_imm_LSLH_operand:$Simm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- bit Simm;
- let cmode = {0b1, 0b0, Simm, 0b0};
- }
-}
-
-multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
- SDPatternOperator opnode,
- SDPatternOperator neonopnode>
-{
- let Constraints = "$src = $Rd" in {
- // shift zeros, per word
- def _2S : NeonI_1VModImm<0b0, op,
- (outs VPR64:$Rd),
- (ins VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (opnode (v2i32 VPR64:$src),
- (v2i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bits<2> Simm;
- let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
- }
-
- def _4S : NeonI_1VModImm<0b1, op,
- (outs VPR128:$Rd),
- (ins VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (opnode (v4i32 VPR128:$src),
- (v4i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bits<2> Simm;
- let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
- }
-
- // shift zeros, per halfword
- def _4H : NeonI_1VModImm<0b0, op,
- (outs VPR64:$Rd),
- (ins VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (opnode (v4i16 VPR64:$src),
- (v4i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bit Simm;
- let cmode = {0b1, 0b0, Simm, 0b1};
- }
-
- def _8H : NeonI_1VModImm<0b1, op,
- (outs VPR128:$Rd),
- (ins VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (opnode (v8i16 VPR128:$src),
- (v8i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bit Simm;
- let cmode = {0b1, 0b0, Simm, 0b1};
- }
- }
-}
-
-multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
- SDPatternOperator opnode>
-{
- // shift ones, per word
- def _2S : NeonI_1VModImm<0b0, op,
- (outs VPR64:$Rd),
- (ins neon_uimm8:$Imm,
- neon_mov_imm_MSL_operand:$Simm),
- !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (opnode (timm:$Imm),
- (neon_mov_imm_MSL_operand:$Simm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- bit Simm;
- let cmode = {0b1, 0b1, 0b0, Simm};
- }
-
- def _4S : NeonI_1VModImm<0b1, op,
- (outs VPR128:$Rd),
- (ins neon_uimm8:$Imm,
- neon_mov_imm_MSL_operand:$Simm),
- !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (opnode (timm:$Imm),
- (neon_mov_imm_MSL_operand:$Simm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- bit Simm;
- let cmode = {0b1, 0b1, 0b0, Simm};
- }
-}
-
-// Vector Move Immediate Shifted
-let isReMaterializable = 1 in {
-defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>;
-}
-
-// Vector Move Inverted Immediate Shifted
-let isReMaterializable = 1 in {
-defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>;
-}
-
-// Vector Bitwise Bit Clear (AND NOT) - immediate
-let isReMaterializable = 1 in {
-defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1,
- and, Neon_mvni>;
-}
-
-// Vector Bitwise OR - immedidate
-
-let isReMaterializable = 1 in {
-defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0,
- or, Neon_movi>;
-}
-
-// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate
-// LowerBUILD_VECTOR favors lowering MOVI over MVNI.
-// BIC immediate instructions selection requires additional patterns to
-// transform Neon_movi operands into BIC immediate operands
-
-def neon_mov_imm_LSLH_transform_XFORM : SDNodeXForm<imm, [{
- uint64_t OpCmode = N->getZExtValue();
- unsigned ShiftImm;
- unsigned ShiftOnesIn;
- (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
- // LSLH restricts shift amount to 0, 8 which are encoded as 0 and 1
- // Transform encoded shift amount 0 to 1 and 1 to 0.
- return CurDAG->getTargetConstant(!ShiftImm, MVT::i32);
-}]>;
-
-def neon_mov_imm_LSLH_transform_operand
- : ImmLeaf<i32, [{
- unsigned ShiftImm;
- unsigned ShiftOnesIn;
- unsigned HasShift =
- A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
- return (HasShift && !ShiftOnesIn); }],
- neon_mov_imm_LSLH_transform_XFORM>;
-
-// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0xff, LSL 8)
-// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0xff)
-def : Pat<(v4i16 (and VPR64:$src,
- (v4i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)))),
- (BICvi_lsl_4H VPR64:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0xff, LSL 8)
-// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0xff)
-def : Pat<(v8i16 (and VPR128:$src,
- (v8i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)))),
- (BICvi_lsl_8H VPR128:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-def : Pat<(v8i8 (and VPR64:$src,
- (bitconvert(v4i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm))))),
- (BICvi_lsl_4H VPR64:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v2i32 (and VPR64:$src,
- (bitconvert(v4i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm))))),
- (BICvi_lsl_4H VPR64:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v1i64 (and VPR64:$src,
- (bitconvert(v4i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm))))),
- (BICvi_lsl_4H VPR64:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-def : Pat<(v16i8 (and VPR128:$src,
- (bitconvert(v8i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm))))),
- (BICvi_lsl_8H VPR128:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v4i32 (and VPR128:$src,
- (bitconvert(v8i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm))))),
- (BICvi_lsl_8H VPR128:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v2i64 (and VPR128:$src,
- (bitconvert(v8i16 (Neon_movi 255,
- neon_mov_imm_LSLH_transform_operand:$Simm))))),
- (BICvi_lsl_8H VPR128:$src, 255,
- neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode,
- SDPatternOperator neonopnode,
- Instruction INST4H,
- Instruction INST8H,
- Instruction INST2S,
- Instruction INST4S> {
- def : Pat<(v8i8 (opnode VPR64:$src,
- (bitconvert(v4i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST4H VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v2i32 (opnode VPR64:$src,
- (bitconvert(v4i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST4H VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v1i64 (opnode VPR64:$src,
- (bitconvert(v4i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST4H VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
-
- def : Pat<(v16i8 (opnode VPR128:$src,
- (bitconvert(v8i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST8H VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v4i32 (opnode VPR128:$src,
- (bitconvert(v8i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST8H VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v2i64 (opnode VPR128:$src,
- (bitconvert(v8i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST8H VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
-
- def : Pat<(v8i8 (opnode VPR64:$src,
- (bitconvert(v2i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST2S VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v4i16 (opnode VPR64:$src,
- (bitconvert(v2i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST2S VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v1i64 (opnode VPR64:$src,
- (bitconvert(v2i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST2S VPR64:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
-
- def : Pat<(v16i8 (opnode VPR128:$src,
- (bitconvert(v4i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST4S VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v8i16 (opnode VPR128:$src,
- (bitconvert(v4i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST4S VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
- def : Pat<(v2i64 (opnode VPR128:$src,
- (bitconvert(v4i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSLH_operand:$Simm))))),
- (INST4S VPR128:$src, neon_uimm8:$Imm,
- neon_mov_imm_LSLH_operand:$Simm)>;
-}
-
-// Additional patterns for Vector Vector Bitwise Bit Clear (AND NOT) - immediate
-defm : Neon_bitwiseVi_patterns<and, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H,
- BICvi_lsl_2S, BICvi_lsl_4S>;
-
-// Additional patterns for Vector Bitwise OR - immedidate
-defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H,
- ORRvi_lsl_2S, ORRvi_lsl_4S>;
-
-
-// Vector Move Immediate Masked
-let isReMaterializable = 1 in {
-defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>;
-}
-
-// Vector Move Inverted Immediate Masked
-let isReMaterializable = 1 in {
-defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>;
-}
-
-class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane,
- Instruction inst, RegisterOperand VPRC>
- : NeonInstAlias<!strconcat(asmop, "\t$Rd," # asmlane # ", $Imm"),
- (inst VPRC:$Rd, neon_uimm8:$Imm, 0), 0b0>;
-
-// Aliases for Vector Move Immediate Shifted
-def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>;
-
-// Aliases for Vector Move Inverted Immediate Shifted
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>;
-
-// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate
-def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>;
-
-// Aliases for Vector Bitwise OR - immedidate
-def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>;
-
-// Vector Move Immediate - per byte
-let isReMaterializable = 1 in {
-def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0,
- (outs VPR64:$Rd), (ins neon_uimm8:$Imm),
- "movi\t$Rd.8b, $Imm",
- [(set (v8i8 VPR64:$Rd),
- (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- let cmode = 0b1110;
-}
-
-def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0,
- (outs VPR128:$Rd), (ins neon_uimm8:$Imm),
- "movi\t$Rd.16b, $Imm",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- let cmode = 0b1110;
-}
-}
-
-// Vector Move Immediate - bytemask, per double word
-let isReMaterializable = 1 in {
-def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1,
- (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm),
- "movi\t $Rd.2d, $Imm",
- [(set (v2i64 VPR128:$Rd),
- (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- let cmode = 0b1110;
-}
-}
-
-// Vector Move Immediate - bytemask, one doubleword
-
-let isReMaterializable = 1 in {
-def MOVIdi : NeonI_1VModImm<0b0, 0b1,
- (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm),
- "movi\t $Rd, $Imm",
- [(set (v1i64 FPR64:$Rd),
- (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- let cmode = 0b1110;
-}
-}
-
-// Vector Floating Point Move Immediate
-
-class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy,
- Operand immOpType, bit q, bit op>
- : NeonI_1VModImm<q, op,
- (outs VPRC:$Rd), (ins immOpType:$Imm),
- "fmov\t$Rd" # asmlane # ", $Imm",
- [(set (OpTy VPRC:$Rd),
- (OpTy (Neon_fmovi (timm:$Imm))))],
- NoItinerary>,
- Sched<[WriteFPALU]> {
- let cmode = 0b1111;
- }
-
-let isReMaterializable = 1 in {
-def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>;
-def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>;
-def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
-}
-
-// Vector Shift (Immediate)
-
-// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded
-// as follows:
-//
-// Offset Encoding
-// 8 immh:immb<6:3> = '0001xxx', <imm> is encoded in immh:immb<2:0>
-// 16 immh:immb<6:4> = '001xxxx', <imm> is encoded in immh:immb<3:0>
-// 32 immh:immb<6:5> = '01xxxxx', <imm> is encoded in immh:immb<4:0>
-// 64 immh:immb<6> = '1xxxxxx', <imm> is encoded in immh:immb<5:0>
-//
-// The shift right immediate amount, in the range 1 to element bits, is computed
-// as Offset - UInt(immh:immb). The shift left immediate amount, in the range 0
-// to element bits - 1, is computed as UInt(immh:immb) - Offset.
-
-class shr_imm_asmoperands<string OFFSET> : AsmOperandClass {
- let Name = "ShrImm" # OFFSET;
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "ShrImm" # OFFSET;
-}
-
-class shr_imm<string OFFSET> : Operand<i32> {
- let EncoderMethod = "getShiftRightImm" # OFFSET;
- let DecoderMethod = "DecodeShiftRightImm" # OFFSET;
- let ParserMatchClass =
- !cast<AsmOperandClass>("shr_imm" # OFFSET # "_asmoperand");
-}
-
-def shr_imm8_asmoperand : shr_imm_asmoperands<"8">;
-def shr_imm16_asmoperand : shr_imm_asmoperands<"16">;
-def shr_imm32_asmoperand : shr_imm_asmoperands<"32">;
-def shr_imm64_asmoperand : shr_imm_asmoperands<"64">;
-
-def shr_imm8 : shr_imm<"8">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 8;}]>;
-def shr_imm16 : shr_imm<"16">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 16;}]>;
-def shr_imm32 : shr_imm<"32">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 32;}]>;
-def shr_imm64 : shr_imm<"64">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 64;}]>;
-
-class shl_imm_asmoperands<string OFFSET> : AsmOperandClass {
- let Name = "ShlImm" # OFFSET;
- let RenderMethod = "addImmOperands";
- let DiagnosticType = "ShlImm" # OFFSET;
-}
-
-class shl_imm<string OFFSET> : Operand<i32> {
- let EncoderMethod = "getShiftLeftImm" # OFFSET;
- let DecoderMethod = "DecodeShiftLeftImm" # OFFSET;
- let ParserMatchClass =
- !cast<AsmOperandClass>("shl_imm" # OFFSET # "_asmoperand");
-}
-
-def shl_imm8_asmoperand : shl_imm_asmoperands<"8">;
-def shl_imm16_asmoperand : shl_imm_asmoperands<"16">;
-def shl_imm32_asmoperand : shl_imm_asmoperands<"32">;
-def shl_imm64_asmoperand : shl_imm_asmoperands<"64">;
-
-def shl_imm8 : shl_imm<"8">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 8;}]>;
-def shl_imm16 : shl_imm<"16">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 16;}]>;
-def shl_imm32 : shl_imm<"32">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 32;}]>;
-def shl_imm64 : shl_imm<"64">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 64;}]>;
-
-class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
- RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
- [(set (Ty VPRC:$Rd),
- (Ty (OpNode (Ty VPRC:$Rn),
- (Ty (Neon_vdup (i32 ImmTy:$Imm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
- // 64-bit vector types.
- def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> {
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- }
-
- def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> {
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- }
-
- def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> {
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- }
-
- // 128-bit vector types.
- def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> {
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- }
-
- def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> {
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- }
-
- def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> {
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- }
-
- def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> {
- let Inst{22} = 0b1; // immh:immb = 1xxxxxx
- }
-}
-
-multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
- def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
- OpNode> {
- let Inst{22} = 0b1;
- }
-}
-
-// Shift left
-
-defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">;
-
-// Additional patterns to match vector shift left by immediate.
-// (v1i8/v1i16/v1i32 types)
-def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn),
- (v1i8 (Neon_vdup (i32 (shl_imm8:$Imm)))))),
- (EXTRACT_SUBREG
- (SHLvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- shl_imm8:$Imm),
- sub_8)>;
-def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn),
- (v1i16 (Neon_vdup (i32 (shl_imm16:$Imm)))))),
- (EXTRACT_SUBREG
- (SHLvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- shl_imm16:$Imm),
- sub_16)>;
-def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn),
- (v1i32 (Neon_vdup (i32 (shl_imm32:$Imm)))))),
- (EXTRACT_SUBREG
- (SHLvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- shl_imm32:$Imm),
- sub_32)>;
-
-// Shift right
-defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>;
-defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>;
-
-// Additional patterns to match vector shift right by immediate.
-// (v1i8/v1i16/v1i32 types)
-def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn),
- (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))),
- (EXTRACT_SUBREG
- (SSHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- shr_imm8:$Imm),
- sub_8)>;
-def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn),
- (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))),
- (EXTRACT_SUBREG
- (SSHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- shr_imm16:$Imm),
- sub_16)>;
-def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn),
- (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))),
- (EXTRACT_SUBREG
- (SSHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- shr_imm32:$Imm),
- sub_32)>;
-def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn),
- (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))),
- (EXTRACT_SUBREG
- (USHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- shr_imm8:$Imm),
- sub_8)>;
-def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn),
- (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))),
- (EXTRACT_SUBREG
- (USHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- shr_imm16:$Imm),
- sub_16)>;
-def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn),
- (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))),
- (EXTRACT_SUBREG
- (USHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- shr_imm32:$Imm),
- sub_32)>;
-
-def Neon_High16B : PatFrag<(ops node:$in),
- (extract_subvector (v16i8 node:$in), (iPTR 8))>;
-def Neon_High8H : PatFrag<(ops node:$in),
- (extract_subvector (v8i16 node:$in), (iPTR 4))>;
-def Neon_High4S : PatFrag<(ops node:$in),
- (extract_subvector (v4i32 node:$in), (iPTR 2))>;
-def Neon_High2D : PatFrag<(ops node:$in),
- (extract_subvector (v2i64 node:$in), (iPTR 1))>;
-def Neon_High4float : PatFrag<(ops node:$in),
- (extract_subvector (v4f32 node:$in), (iPTR 2))>;
-def Neon_High2double : PatFrag<(ops node:$in),
- (extract_subvector (v2f64 node:$in), (iPTR 1))>;
-
-def Neon_Low16B : PatFrag<(ops node:$in),
- (v8i8 (extract_subvector (v16i8 node:$in),
- (iPTR 0)))>;
-def Neon_Low8H : PatFrag<(ops node:$in),
- (v4i16 (extract_subvector (v8i16 node:$in),
- (iPTR 0)))>;
-def Neon_Low4S : PatFrag<(ops node:$in),
- (v2i32 (extract_subvector (v4i32 node:$in),
- (iPTR 0)))>;
-def Neon_Low2D : PatFrag<(ops node:$in),
- (v1i64 (extract_subvector (v2i64 node:$in),
- (iPTR 0)))>;
-def Neon_Low4float : PatFrag<(ops node:$in),
- (v2f32 (extract_subvector (v4f32 node:$in),
- (iPTR 0)))>;
-def Neon_Low2double : PatFrag<(ops node:$in),
- (v1f64 (extract_subvector (v2f64 node:$in),
- (iPTR 0)))>;
-
-class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
- string SrcT, ValueType DestTy, ValueType SrcTy,
- Operand ImmTy, SDPatternOperator ExtOp>
- : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
- (ins VPR64:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
- [(set (DestTy VPR128:$Rd),
- (DestTy (shl
- (DestTy (ExtOp (SrcTy VPR64:$Rn))),
- (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
- string SrcT, ValueType DestTy, ValueType SrcTy,
- int StartIndex, Operand ImmTy,
- SDPatternOperator ExtOp, PatFrag getTop>
- : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
- (ins VPR128:$Rn, ImmTy:$Imm),
- asmop # "2\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
- [(set (DestTy VPR128:$Rd),
- (DestTy (shl
- (DestTy (ExtOp
- (SrcTy (getTop VPR128:$Rn)))),
- (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
- SDNode ExtOp> {
- // 64-bit vector types.
- def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8,
- shl_imm8, ExtOp> {
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- }
-
- def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16,
- shl_imm16, ExtOp> {
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- }
-
- def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32,
- shl_imm32, ExtOp> {
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- }
-
- // 128-bit vector types
- def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8,
- 8, shl_imm8, ExtOp, Neon_High16B> {
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- }
-
- def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16,
- 4, shl_imm16, ExtOp, Neon_High8H> {
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- }
-
- def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32,
- 2, shl_imm32, ExtOp, Neon_High4S> {
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- }
-
- // Use other patterns to match when the immediate is 0.
- def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))),
- (!cast<Instruction>(prefix # "_8B") VPR64:$Rn, 0)>;
-
- def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))),
- (!cast<Instruction>(prefix # "_4H") VPR64:$Rn, 0)>;
-
- def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))),
- (!cast<Instruction>(prefix # "_2S") VPR64:$Rn, 0)>;
-
- def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))),
- (!cast<Instruction>(prefix # "_16B") VPR128:$Rn, 0)>;
-
- def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))),
- (!cast<Instruction>(prefix # "_8H") VPR128:$Rn, 0)>;
-
- def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))),
- (!cast<Instruction>(prefix # "_4S") VPR128:$Rn, 0)>;
-}
-
-// Shift left long
-defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>;
-defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>;
-
-class NeonI_ext_len_alias<string asmop, string lane, string laneOp,
- Instruction inst, RegisterOperand VPRC,
- RegisterOperand VPRCOp>
- : NeonInstAlias<asmop # "\t$Rd" # lane #", $Rn" # laneOp,
- (inst VPRC:$Rd, VPRCOp:$Rn, 0), 0b0>;
-
-// Signed integer lengthen (vector) is alias for SSHLL Vd, Vn, #0
-// Signed integer lengthen (vector, second part) is alias for SSHLL2 Vd, Vn, #0
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-def SXTLvv_8B : NeonI_ext_len_alias<"sxtl", ".8h", ".8b", SSHLLvvi_8B, VPR128, VPR64>;
-def SXTLvv_4H : NeonI_ext_len_alias<"sxtl", ".4s", ".4h", SSHLLvvi_4H, VPR128, VPR64>;
-def SXTLvv_2S : NeonI_ext_len_alias<"sxtl", ".2d", ".2s", SSHLLvvi_2S, VPR128, VPR64>;
-def SXTL2vv_16B : NeonI_ext_len_alias<"sxtl2", ".8h", ".16b", SSHLLvvi_16B, VPR128, VPR128>;
-def SXTL2vv_8H : NeonI_ext_len_alias<"sxtl2", ".4s", ".8h", SSHLLvvi_8H, VPR128, VPR128>;
-def SXTL2vv_4S : NeonI_ext_len_alias<"sxtl2", ".2d", ".4s", SSHLLvvi_4S, VPR128, VPR128>;
-
-// Unsigned integer lengthen (vector) is alias for USHLL Vd, Vn, #0
-// Unsigned integer lengthen (vector, second part) is alias for USHLL2 Vd, Vn, #0
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-def UXTLvv_8B : NeonI_ext_len_alias<"uxtl", ".8h", ".8b", USHLLvvi_8B, VPR128, VPR64>;
-def UXTLvv_4H : NeonI_ext_len_alias<"uxtl", ".4s", ".4h", USHLLvvi_4H, VPR128, VPR64>;
-def UXTLvv_2S : NeonI_ext_len_alias<"uxtl", ".2d", ".2s", USHLLvvi_2S, VPR128, VPR64>;
-def UXTL2vv_16B : NeonI_ext_len_alias<"uxtl2", ".8h", ".16b", USHLLvvi_16B, VPR128, VPR128>;
-def UXTL2vv_8H : NeonI_ext_len_alias<"uxtl2", ".4s", ".8h", USHLLvvi_8H, VPR128, VPR128>;
-def UXTL2vv_4S : NeonI_ext_len_alias<"uxtl2", ".2d", ".4s", USHLLvvi_4S, VPR128, VPR128>;
-
-def : Pat<(v8i16 (anyext (v8i8 VPR64:$Rn))), (USHLLvvi_8B VPR64:$Rn, 0)>;
-def : Pat<(v4i32 (anyext (v4i16 VPR64:$Rn))), (USHLLvvi_4H VPR64:$Rn, 0)>;
-def : Pat<(v2i64 (anyext (v2i32 VPR64:$Rn))), (USHLLvvi_2S VPR64:$Rn, 0)>;
-
-// Rounding/Saturating shift
-class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
- RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
- SDPatternOperator OpNode>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
- [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
- (i32 ImmTy:$Imm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-// shift right (vector by immediate)
-multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop,
- SDPatternOperator OpNode> {
- def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
- OpNode> {
- let Inst{22} = 0b1;
- }
-}
-
-multiclass NeonI_N2VShL_Q<bit u, bits<5> opcode, string asmop,
- SDPatternOperator OpNode> {
- // 64-bit vector types.
- def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- // 128-bit vector types.
- def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
- OpNode> {
- let Inst{22} = 0b1;
- }
-}
-
-// Rounding shift right
-defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr",
- int_aarch64_neon_vsrshr>;
-defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr",
- int_aarch64_neon_vurshr>;
-
-// Saturating shift left unsigned
-defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>;
-
-// Saturating shift left
-defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>;
-defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>;
-
-class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
- RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
- SDNode OpNode>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
- [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
- (Ty (OpNode (Ty VPRC:$Rn),
- (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-// Shift Right accumulate
-multiclass NeonI_N2VShRAdd<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
- def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
- OpNode> {
- let Inst{22} = 0b1;
- }
-}
-
-// Shift right and accumulate
-defm SSRAvvi : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>;
-defm USRAvvi : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>;
-
-// Rounding shift accumulate
-class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T,
- RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
- SDPatternOperator OpNode>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
- [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
- (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_N2VShRAdd_R<bit u, bits<5> opcode, string asmop,
- SDPatternOperator OpNode> {
- def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
- OpNode> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
- OpNode> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
- OpNode> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
- OpNode> {
- let Inst{22} = 0b1;
- }
-}
-
-// Rounding shift right and accumulate
-defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>;
-defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>;
-
-// Shift insert by immediate
-class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T,
- RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
- SDPatternOperator OpNode>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
- [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn),
- (i32 ImmTy:$Imm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-// shift left insert (vector by immediate)
-multiclass NeonI_N2VShLIns<bit u, bits<5> opcode, string asmop> {
- def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
- int_aarch64_neon_vsli> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
- int_aarch64_neon_vsli> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
- int_aarch64_neon_vsli> {
- let Inst{22-21} = 0b01;
- }
-
- // 128-bit vector types
- def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
- int_aarch64_neon_vsli> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
- int_aarch64_neon_vsli> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
- int_aarch64_neon_vsli> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
- int_aarch64_neon_vsli> {
- let Inst{22} = 0b1;
- }
-}
-
-// shift right insert (vector by immediate)
-multiclass NeonI_N2VShRIns<bit u, bits<5> opcode, string asmop> {
- // 64-bit vector types.
- def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
- int_aarch64_neon_vsri> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
- int_aarch64_neon_vsri> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
- int_aarch64_neon_vsri> {
- let Inst{22-21} = 0b01;
- }
-
- // 128-bit vector types
- def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
- int_aarch64_neon_vsri> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
- int_aarch64_neon_vsri> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
- int_aarch64_neon_vsri> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
- int_aarch64_neon_vsri> {
- let Inst{22} = 0b1;
- }
-}
-
-// Shift left and insert
-defm SLIvvi : NeonI_N2VShLIns<0b1, 0b01010, "sli">;
-
-// Shift right and insert
-defm SRIvvi : NeonI_N2VShRIns<0b1, 0b01000, "sri">;
-
-class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT,
- string SrcT, Operand ImmTy>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT,
- string SrcT, Operand ImmTy>
- : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
- (ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-// left long shift by immediate
-multiclass NeonI_N2VShR_Narrow<bit u, bits<5> opcode, string asmop> {
- def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> {
- let Inst{22-19} = 0b0001;
- }
-
- def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> {
- let Inst{22-20} = 0b001;
- }
-
- def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> {
- let Inst{22-21} = 0b01;
- }
-
- // Shift Narrow High
- def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h",
- shr_imm8> {
- let Inst{22-19} = 0b0001;
- }
-
- def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s",
- shr_imm16> {
- let Inst{22-20} = 0b001;
- }
-
- def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d",
- shr_imm32> {
- let Inst{22-21} = 0b01;
- }
-}
-
-// Shift right narrow
-defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">;
-
-// Shift right narrow (prefix Q is saturating, prefix R is rounding)
-defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">;
-defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">;
-defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">;
-defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">;
-defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">;
-defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">;
-defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">;
-
-def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn),
- (v2i64 (concat_vectors (v1i64 node:$Rm),
- (v1i64 node:$Rn)))>;
-def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn),
- (v8i16 (concat_vectors (v4i16 node:$Rm),
- (v4i16 node:$Rn)))>;
-def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn),
- (v4i32 (concat_vectors (v2i32 node:$Rm),
- (v2i32 node:$Rn)))>;
-def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn),
- (v4f32 (concat_vectors (v2f32 node:$Rm),
- (v2f32 node:$Rn)))>;
-def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn),
- (v2f64 (concat_vectors (v1f64 node:$Rm),
- (v1f64 node:$Rn)))>;
-
-def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
- (v8i16 (srl (v8i16 node:$lhs),
- (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
- (v4i32 (srl (v4i32 node:$lhs),
- (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
- (v2i64 (srl (v2i64 node:$lhs),
- (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
- (v8i16 (sra (v8i16 node:$lhs),
- (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
- (v4i32 (sra (v4i32 node:$lhs),
- (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
- (v2i64 (sra (v2i64 node:$lhs),
- (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
-
-// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
-multiclass Neon_shiftNarrow_patterns<string shr> {
- def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
- (i32 shr_imm8:$Imm)))),
- (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
- def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
- (i32 shr_imm16:$Imm)))),
- (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
- def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
- (i32 shr_imm32:$Imm)))),
- (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;
-
- def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
- (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
- VPR128:$Rn, (i32 shr_imm8:$Imm))))))),
- (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
- VPR128:$Rn, imm:$Imm)>;
- def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
- (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
- VPR128:$Rn, (i32 shr_imm16:$Imm))))))),
- (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- VPR128:$Rn, imm:$Imm)>;
- def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
- (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
- VPR128:$Rn, (i32 shr_imm32:$Imm))))))),
- (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- VPR128:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_shiftNarrow_QR_patterns<SDPatternOperator op, string prefix> {
- def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)),
- (!cast<Instruction>(prefix # "_8B") VPR128:$Rn, imm:$Imm)>;
- def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)),
- (!cast<Instruction>(prefix # "_4H") VPR128:$Rn, imm:$Imm)>;
- def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)),
- (!cast<Instruction>(prefix # "_2S") VPR128:$Rn, imm:$Imm)>;
-
- def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
- (v1i64 (bitconvert (v8i8
- (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))),
- (!cast<Instruction>(prefix # "_16B")
- (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- VPR128:$Rn, imm:$Imm)>;
- def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
- (v1i64 (bitconvert (v4i16
- (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))),
- (!cast<Instruction>(prefix # "_8H")
- (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- VPR128:$Rn, imm:$Imm)>;
- def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
- (v1i64 (bitconvert (v2i32
- (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))),
- (!cast<Instruction>(prefix # "_4S")
- (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- VPR128:$Rn, imm:$Imm)>;
-}
-
-defm : Neon_shiftNarrow_patterns<"lshr">;
-defm : Neon_shiftNarrow_patterns<"ashr">;
-
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrun, "QSHRUNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vrshrn, "RSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrun, "QRSHRUNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrn, "SQSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqshrn, "UQSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrn, "SQRSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqrshrn, "UQRSHRNvvi">;
-
-// Convert fix-point and float-pointing
-class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T,
- RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy,
- Operand ImmTy, SDPatternOperator IntOp>
- : NeonI_2VShiftImm<q, u, opcode,
- (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
- asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
- [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn),
- (i32 ImmTy:$Imm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop,
- SDPatternOperator IntOp> {
- def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32,
- shr_imm32, IntOp> {
- let Inst{22-21} = 0b01;
- }
-
- def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32,
- shr_imm32, IntOp> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64,
- shr_imm64, IntOp> {
- let Inst{22} = 0b1;
- }
-}
-
-multiclass NeonI_N2VCvt_Fp2fx<bit u, bits<5> opcode, string asmop,
- SDPatternOperator IntOp> {
- def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32,
- shr_imm32, IntOp> {
- let Inst{22-21} = 0b01;
- }
-
- def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32,
- shr_imm32, IntOp> {
- let Inst{22-21} = 0b01;
- }
-
- def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64,
- shr_imm64, IntOp> {
- let Inst{22} = 0b1;
- }
-}
-
-// Convert fixed-point to floating-point
-defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf",
- int_arm_neon_vcvtfxs2fp>;
-defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf",
- int_arm_neon_vcvtfxu2fp>;
-
-// Convert floating-point to fixed-point
-defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs",
- int_arm_neon_vcvtfp2fxs>;
-defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu",
- int_arm_neon_vcvtfp2fxu>;
-
-multiclass Neon_sshll2_0<SDNode ext>
-{
- def _v8i8 : PatFrag<(ops node:$Rn),
- (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>;
- def _v4i16 : PatFrag<(ops node:$Rn),
- (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>;
- def _v2i32 : PatFrag<(ops node:$Rn),
- (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>;
-}
-
-defm NI_sext_high : Neon_sshll2_0<sext>;
-defm NI_zext_high : Neon_sshll2_0<zext>;
-
-
-//===----------------------------------------------------------------------===//
-// Multiclasses for NeonI_Across
-//===----------------------------------------------------------------------===//
-
-// Variant 1
-
-multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
- string asmop, SDPatternOperator opnode>
-{
- def _1h8b: NeonI_2VAcross<0b0, u, 0b00, opcode,
- (outs FPR16:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd, $Rn.8b",
- [(set (v1i16 FPR16:$Rd),
- (v1i16 (opnode (v8i8 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
- (outs FPR16:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.16b",
- [(set (v1i16 FPR16:$Rd),
- (v1i16 (opnode (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode,
- (outs FPR32:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd, $Rn.4h",
- [(set (v1i32 FPR32:$Rd),
- (v1i32 (opnode (v4i16 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode,
- (outs FPR32:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.8h",
- [(set (v1i32 FPR32:$Rd),
- (v1i32 (opnode (v8i16 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- // _1d2s doesn't exist!
-
- def _1d4s: NeonI_2VAcross<0b1, u, 0b10, opcode,
- (outs FPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.4s",
- [(set (v1i64 FPR64:$Rd),
- (v1i64 (opnode (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>;
-defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>;
-
-// Variant 2
-
-multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
- string asmop, SDPatternOperator opnode>
-{
- def _1b8b: NeonI_2VAcross<0b0, u, 0b00, opcode,
- (outs FPR8:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd, $Rn.8b",
- [(set (v1i8 FPR8:$Rd),
- (v1i8 (opnode (v8i8 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
- (outs FPR8:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.16b",
- [(set (v1i8 FPR8:$Rd),
- (v1i8 (opnode (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode,
- (outs FPR16:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd, $Rn.4h",
- [(set (v1i16 FPR16:$Rd),
- (v1i16 (opnode (v4i16 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode,
- (outs FPR16:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.8h",
- [(set (v1i16 FPR16:$Rd),
- (v1i16 (opnode (v8i16 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- // _1s2s doesn't exist!
-
- def _1s4s: NeonI_2VAcross<0b1, u, 0b10, opcode,
- (outs FPR32:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.4s",
- [(set (v1i32 FPR32:$Rd),
- (v1i32 (opnode (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>;
-defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>;
-
-defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>;
-defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>;
-
-defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>;
-
-// Variant 3
-
-multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size,
- string asmop, SDPatternOperator opnode> {
- def _1s4s: NeonI_2VAcross<0b1, u, size, opcode,
- (outs FPR32:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd, $Rn.4s",
- [(set (f32 FPR32:$Rd),
- (f32 (opnode (v4f32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv",
- int_aarch64_neon_vmaxnmv>;
-defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv",
- int_aarch64_neon_vminnmv>;
-
-defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv",
- int_aarch64_neon_vmaxv>;
-defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv",
- int_aarch64_neon_vminv>;
-
-// The followings are for instruction class (Perm)
-
-class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
- string asmop, RegisterOperand OpVPR, string OpS,
- SDPatternOperator opnode, ValueType Ty>
- : NeonI_Perm<q, size, opcode,
- (outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (Ty OpVPR:$Rd),
- (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
- SDPatternOperator opnode> {
- def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop,
- VPR64, "8b", opnode, v8i8>;
- def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop,
- VPR128, "16b",opnode, v16i8>;
- def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop,
- VPR64, "4h", opnode, v4i16>;
- def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop,
- VPR128, "8h", opnode, v8i16>;
- def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop,
- VPR64, "2s", opnode, v2i32>;
- def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop,
- VPR128, "4s", opnode, v4i32>;
- def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop,
- VPR128, "2d", opnode, v2i64>;
-}
-
-defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>;
-defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>;
-defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>;
-defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>;
-defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>;
-defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>;
-
-multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> {
- def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
- (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>;
-
- def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
- (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>;
-
- def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
- (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>;
-}
-
-defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>;
-defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>;
-defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>;
-defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>;
-defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>;
-defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>;
-
-// The followings are for instruction class (3V Diff)
-
-// normal long/long2 pattern
-class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode, SDPatternOperator ext,
- RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))),
- (ResTy (ext (OpTy OpVPR:$Rm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDL_s<bit u, bits<4> opcode,
- string asmop, SDPatternOperator opnode,
- bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, sext, VPR64, v8i16, v8i8>;
- def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, sext, VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, sext, VPR64, v2i64, v2i32>;
- }
-}
-
-multiclass NeonI_3VDL2_s<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
- def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
- def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
- }
-}
-
-multiclass NeonI_3VDL_u<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, zext, VPR64, v8i16, v8i8>;
- def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, zext, VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, zext, VPR64, v2i64, v2i32>;
- }
-}
-
-multiclass NeonI_3VDL2_u<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
- def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
- def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
- }
-}
-
-defm SADDLvvv : NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>;
-defm UADDLvvv : NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>;
-
-defm SADDL2vvv : NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>;
-defm UADDL2vvv : NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>;
-
-defm SSUBLvvv : NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>;
-defm USUBLvvv : NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>;
-
-defm SSUBL2vvv : NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>;
-defm USUBL2vvv : NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>;
-
-// normal wide/wide2 pattern
-class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode, SDPatternOperator ext,
- RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # ResS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (opnode (ResTy VPR128:$Rn),
- (ResTy (ext (OpTy OpVPR:$Rm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, sext, VPR64, v8i16, v8i8>;
- def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, sext, VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, sext, VPR64, v2i64, v2i32>;
-}
-
-defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>;
-defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>;
-
-multiclass NeonI_3VDW2_s<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
- def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
- def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
-}
-
-defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>;
-defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>;
-
-multiclass NeonI_3VDW_u<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, zext, VPR64, v8i16, v8i8>;
- def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, zext, VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, zext, VPR64, v2i64, v2i32>;
-}
-
-defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>;
-defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>;
-
-multiclass NeonI_3VDW2_u<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
- def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
- def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
-}
-
-defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>;
-defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>;
-
-// Get the high half part of the vector element.
-multiclass NeonI_get_high {
- def _8h : PatFrag<(ops node:$Rn),
- (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
- (v8i16 (Neon_vdup (i32 8)))))))>;
- def _4s : PatFrag<(ops node:$Rn),
- (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
- (v4i32 (Neon_vdup (i32 16)))))))>;
- def _2d : PatFrag<(ops node:$Rn),
- (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
- (v2i64 (Neon_vdup (i32 32)))))))>;
-}
-
-defm NI_get_hi : NeonI_get_high;
-
-// pattern for addhn/subhn with 2 operands
-class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode, SDPatternOperator get_hi,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR64:$Rd),
- (ResTy (get_hi
- (OpTy (opnode (OpTy VPR128:$Rn),
- (OpTy VPR128:$Rm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
- opnode, NI_get_hi_8h, v8i8, v8i16>;
- def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
- opnode, NI_get_hi_4s, v4i16, v4i32>;
- def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
- opnode, NI_get_hi_2d, v2i32, v2i64>;
- }
-}
-
-defm ADDHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>;
-defm SUBHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>;
-
-// pattern for operation with 2 operands
-class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode,
- RegisterOperand ResVPR, RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs ResVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy ResVPR:$Rd),
- (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-// normal narrow pattern
-multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
- opnode, VPR64, VPR128, v8i8, v8i16>;
- def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
- opnode, VPR64, VPR128, v4i16, v4i32>;
- def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
- opnode, VPR64, VPR128, v2i32, v2i64>;
- }
-}
-
-defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>;
-defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>;
-
-// pattern for acle intrinsic with 3 operands
-class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- let neverHasSideEffects = 1;
-}
-
-multiclass NeonI_3VDN_3Op_v1<bit u, bits<4> opcode, string asmop> {
- def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">;
- def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">;
- def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">;
-}
-
-defm ADDHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">;
-defm SUBHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">;
-
-defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">;
-defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">;
-
-// Patterns have to be separate because there's a SUBREG_TO_REG in the output
-// part.
-class NarrowHighHalfPat<Instruction INST, ValueType DstTy, ValueType SrcTy,
- SDPatternOperator coreop>
- : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
- (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn),
- (SrcTy VPR128:$Rm)))))),
- (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- VPR128:$Rn, VPR128:$Rm)>;
-
-// addhn2 patterns
-def : NarrowHighHalfPat<ADDHN2vvv_16b8h, v8i8, v8i16,
- BinOpFrag<(NI_get_hi_8h (add node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<ADDHN2vvv_8h4s, v4i16, v4i32,
- BinOpFrag<(NI_get_hi_4s (add node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<ADDHN2vvv_4s2d, v2i32, v2i64,
- BinOpFrag<(NI_get_hi_2d (add node:$LHS, node:$RHS))>>;
-
-// subhn2 patterns
-def : NarrowHighHalfPat<SUBHN2vvv_16b8h, v8i8, v8i16,
- BinOpFrag<(NI_get_hi_8h (sub node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<SUBHN2vvv_8h4s, v4i16, v4i32,
- BinOpFrag<(NI_get_hi_4s (sub node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<SUBHN2vvv_4s2d, v2i32, v2i64,
- BinOpFrag<(NI_get_hi_2d (sub node:$LHS, node:$RHS))>>;
-
-// raddhn2 patterns
-def : NarrowHighHalfPat<RADDHN2vvv_16b8h, v8i8, v8i16, int_arm_neon_vraddhn>;
-def : NarrowHighHalfPat<RADDHN2vvv_8h4s, v4i16, v4i32, int_arm_neon_vraddhn>;
-def : NarrowHighHalfPat<RADDHN2vvv_4s2d, v2i32, v2i64, int_arm_neon_vraddhn>;
-
-// rsubhn2 patterns
-def : NarrowHighHalfPat<RSUBHN2vvv_16b8h, v8i8, v8i16, int_arm_neon_vrsubhn>;
-def : NarrowHighHalfPat<RSUBHN2vvv_8h4s, v4i16, v4i32, int_arm_neon_vrsubhn>;
-def : NarrowHighHalfPat<RSUBHN2vvv_4s2d, v2i32, v2i64, int_arm_neon_vrsubhn>;
-
-// pattern that need to extend result
-class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode,
- RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy, ValueType OpSTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn),
- (OpTy OpVPR:$Rm))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, VPR64, v8i16, v8i8, v8i8>;
- def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, VPR64, v4i32, v4i16, v4i16>;
- def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, VPR64, v2i64, v2i32, v2i32>;
- }
-}
-
-defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>;
-defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>;
-
-multiclass NeonI_Op_High<SDPatternOperator op> {
- def _16B : PatFrag<(ops node:$Rn, node:$Rm),
- (op (v8i8 (Neon_High16B node:$Rn)),
- (v8i8 (Neon_High16B node:$Rm)))>;
- def _8H : PatFrag<(ops node:$Rn, node:$Rm),
- (op (v4i16 (Neon_High8H node:$Rn)),
- (v4i16 (Neon_High8H node:$Rm)))>;
- def _4S : PatFrag<(ops node:$Rn, node:$Rm),
- (op (v2i32 (Neon_High4S node:$Rn)),
- (v2i32 (Neon_High4S node:$Rm)))>;
-}
-
-defm NI_sabdl_hi : NeonI_Op_High<int_arm_neon_vabds>;
-defm NI_uabdl_hi : NeonI_Op_High<int_arm_neon_vabdu>;
-defm NI_smull_hi : NeonI_Op_High<int_arm_neon_vmulls>;
-defm NI_umull_hi : NeonI_Op_High<int_arm_neon_vmullu>;
-defm NI_qdmull_hi : NeonI_Op_High<int_arm_neon_vqdmull>;
-defm NI_pmull_hi : NeonI_Op_High<int_arm_neon_vmullp>;
-
-multiclass NeonI_3VDL_Abd_u<bit u, bits<4> opcode, string asmop, string opnode,
- bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h8b : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- !cast<PatFrag>(opnode # "_16B"),
- VPR128, v8i16, v16i8, v8i8>;
- def _4s4h : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- !cast<PatFrag>(opnode # "_8H"),
- VPR128, v4i32, v8i16, v4i16>;
- def _2d2s : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- !cast<PatFrag>(opnode # "_4S"),
- VPR128, v2i64, v4i32, v2i32>;
- }
-}
-
-defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>;
-defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>;
-
-// For pattern that need two operators being chained.
-class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode, SDPatternOperator subop,
- RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy, ValueType OpSTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (opnode
- (ResTy VPR128:$src),
- (ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn),
- (OpTy OpVPR:$Rm))))))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_3VDL_Aba_v1<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, SDPatternOperator subop>{
- def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, subop, VPR64, v8i16, v8i8, v8i8>;
- def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, subop, VPR64, v4i32, v4i16, v4i16>;
- def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, subop, VPR64, v2i64, v2i32, v2i32>;
-}
-
-defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal",
- add, int_arm_neon_vabds>;
-defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal",
- add, int_arm_neon_vabdu>;
-
-multiclass NeonI_3VDL2_Aba_v1<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, string subop> {
- def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- opnode, !cast<PatFrag>(subop # "_16B"),
- VPR128, v8i16, v16i8, v8i8>;
- def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- opnode, !cast<PatFrag>(subop # "_8H"),
- VPR128, v4i32, v8i16, v4i16>;
- def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- opnode, !cast<PatFrag>(subop # "_4S"),
- VPR128, v2i64, v4i32, v2i32>;
-}
-
-defm SABAL2vvv : NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add,
- "NI_sabdl_hi">;
-defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add,
- "NI_uabdl_hi">;
-
-// Long pattern with 2 operands
-multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable,
- SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
- def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, VPR128, VPR64, v8i16, v8i8>;
- def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, VPR128, VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, VPR128, VPR64, v2i64, v2i32>;
- }
-}
-
-defm SMULLvvv : NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>;
-defm UMULLvvv : NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>;
-
-class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>;
-
-multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop,
- string opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- !cast<PatFrag>(opnode # "_16B"),
- v8i16, v16i8>;
- def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- !cast<PatFrag>(opnode # "_8H"),
- v4i32, v8i16>;
- def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- !cast<PatFrag>(opnode # "_4S"),
- v2i64, v4i32>;
- }
-}
-
-defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2",
- "NI_smull_hi", 1>;
-defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2",
- "NI_umull_hi", 1>;
-
-// Long pattern with 3 operands
-class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator opnode,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (opnode
- (ResTy VPR128:$src),
- (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
- let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_3VDL_3Op_v1<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, v8i16, v8i8>;
- def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, v4i32, v4i16>;
- def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, v2i64, v2i32>;
-}
-
-def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
- (add node:$Rd,
- (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
-
-def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
- (add node:$Rd,
- (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
-
-def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
- (sub node:$Rd,
- (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
-
-def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
- (sub node:$Rd,
- (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
-
-defm SMLALvvv : NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>;
-defm UMLALvvv : NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>;
-
-defm SMLSLvvv : NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>;
-defm UMLSLvvv : NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>;
-
-class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS,
- SDPatternOperator subop, SDPatternOperator opnode,
- RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy>
- : NeonI_3VDiff<q, u, size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [(set (ResTy VPR128:$Rd),
- (ResTy (subop
- (ResTy VPR128:$src),
- (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))],
- NoItinerary>,
- Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
- let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_3VDL2_3Op_mlas_v1<bit u, bits<4> opcode, string asmop,
- SDPatternOperator subop, string opnode> {
- def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- subop, !cast<PatFrag>(opnode # "_16B"),
- VPR128, v8i16, v16i8>;
- def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- subop, !cast<PatFrag>(opnode # "_8H"),
- VPR128, v4i32, v8i16>;
- def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- subop, !cast<PatFrag>(opnode # "_4S"),
- VPR128, v2i64, v4i32>;
-}
-
-defm SMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2",
- add, "NI_smull_hi">;
-defm UMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2",
- add, "NI_umull_hi">;
-
-defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2",
- sub, "NI_smull_hi">;
-defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2",
- sub, "NI_umull_hi">;
-
-multiclass NeonI_3VDL_qdmlal_3Op_v2<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, int_arm_neon_vqdmull,
- VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, int_arm_neon_vqdmull,
- VPR64, v2i64, v2i32>;
-}
-
-defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal",
- int_arm_neon_vqadds>;
-defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl",
- int_arm_neon_vqsubs>;
-
-multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
- opnode, VPR128, VPR64, v4i32, v4i16>;
- def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
- opnode, VPR128, VPR64, v2i64, v2i32>;
- }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull",
- int_arm_neon_vqdmull, 1>;
-}
-
-multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop,
- string opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- !cast<PatFrag>(opnode # "_8H"),
- v4i32, v8i16>;
- def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- !cast<PatFrag>(opnode # "_4S"),
- v2i64, v4i32>;
- }
-}
-
-defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2",
- "NI_qdmull_hi", 1>;
-
-multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode> {
- def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
- opnode, NI_qdmull_hi_8H,
- VPR128, v4i32, v8i16>;
- def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
- opnode, NI_qdmull_hi_4S,
- VPR128, v2i64, v4i32>;
-}
-
-defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2",
- int_arm_neon_vqadds>;
-defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
- int_arm_neon_vqsubs>;
-
-multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode_8h8b,
- SDPatternOperator opnode_1q1d, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode_8h8b, VPR128, VPR64, v8i16, v8i8>;
-
- def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d",
- opnode_1q1d, VPR128, VPR64, v16i8, v1i64>;
- }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in
-defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp,
- int_aarch64_neon_vmull_p64, 1>;
-
-multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
- string opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
- !cast<PatFrag>(opnode # "_16B"),
- v8i16, v16i8>;
-
- def _1q2d :
- NeonI_3VDiff<0b1, u, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (int_aarch64_neon_vmull_p64
- (v1i64 (scalar_to_vector
- (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))),
- (v1i64 (scalar_to_vector
- (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))],
- NoItinerary>,
- Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>;
- }
-
- def : Pat<(v16i8 (int_aarch64_neon_vmull_p64
- (v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 1))),
- (v1i64 (extract_subvector (v2i64 VPR128:$Rm), (i64 1))))),
- (!cast<Instruction>(NAME # "_1q2d") VPR128:$Rn, VPR128:$Rm)>;
-}
-
-defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
- 1>;
-
-// End of implementation for instruction class (3V Diff)
-
-// The followings are vector load/store multiple N-element structure
-// (class SIMD lselem).
-
-// ld1: load multiple 1-element structure to 1/2/3/4 registers.
-// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4).
-// The structure consists of a sequence of sets of N values.
-// The first element of the structure is placed in the first lane
-// of the first first vector, the second element in the first lane
-// of the second vector, and so on.
-// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
-// the three 64-bit vectors list {BA, DC, FE}.
-// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
-// 64-bit vectors list {DA, EB, FC}.
-// Store instructions store multiple structure to N registers like load.
-
-
-class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
- RegisterOperand VecList, string asmop>
- : NeonI_LdStMult<q, 1, opcode, size,
- (outs VecList:$Rt), (ins GPR64xsp:$Rn),
- asmop # "\t$Rt, [$Rn]",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, ReadVecLd]> {
- let mayLoad = 1;
- let neverHasSideEffects = 1;
-}
-
-multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> {
- def _8B : NeonI_LDVList<0, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
- def _4H : NeonI_LDVList<0, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
- def _2S : NeonI_LDVList<0, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
- def _16B : NeonI_LDVList<1, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
- def _8H : NeonI_LDVList<1, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
- def _4S : NeonI_LDVList<1, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
- def _2D : NeonI_LDVList<1, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
-defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
-def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
-
-defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
-
-defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
-
-defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
-
-// Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
-defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
-def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
-
-defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">;
-def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">;
-
-defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">;
-def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">;
-
-class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
- RegisterOperand VecList, string asmop>
- : NeonI_LdStMult<q, 0, opcode, size,
- (outs), (ins GPR64xsp:$Rn, VecList:$Rt),
- asmop # "\t$Rt, [$Rn]",
- [],
- NoItinerary>,
- Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
- let mayStore = 1;
- let neverHasSideEffects = 1;
-}
-
-multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> {
- def _8B : NeonI_STVList<0, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
- def _4H : NeonI_STVList<0, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
- def _2S : NeonI_STVList<0, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
- def _16B : NeonI_STVList<1, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
- def _8H : NeonI_STVList<1, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
- def _4S : NeonI_STVList<1, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
- def _2D : NeonI_STVList<1, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Store multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
-def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
-
-defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
-
-defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
-
-defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
-
-// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
-defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
-def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
-
-defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
-def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
-
-defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
-def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
-
-def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-
-def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-
-def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
-def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
-
-def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
-def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
-
-def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
-def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
-
-def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
-def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
-
-def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
- (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
- (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
-
-def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
- (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
- (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
-
-def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
- (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
- (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
-
-def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
- (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
- (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
-
-def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
- (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
- (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
-
-def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
- (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
- (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
-
-// Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store.
-// FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal,
-// these patterns are not needed any more.
-def : Pat<(v1i8 (load GPR64xsp:$addr)), (LSFP8_LDR $addr, 0)>;
-def : Pat<(v1i16 (load GPR64xsp:$addr)), (LSFP16_LDR $addr, 0)>;
-def : Pat<(v1i32 (load GPR64xsp:$addr)), (LSFP32_LDR $addr, 0)>;
-
-def : Pat<(store (v1i8 FPR8:$value), GPR64xsp:$addr),
- (LSFP8_STR $value, $addr, 0)>;
-def : Pat<(store (v1i16 FPR16:$value), GPR64xsp:$addr),
- (LSFP16_STR $value, $addr, 0)>;
-def : Pat<(store (v1i32 FPR32:$value), GPR64xsp:$addr),
- (LSFP32_STR $value, $addr, 0)>;
-
-
-// End of vector load/store multiple N-element structure(class SIMD lselem)
-
-// The followings are post-index vector load/store multiple N-element
-// structure(class SIMD lselem-post)
-def exact1_asmoperand : AsmOperandClass {
- let Name = "Exact1";
- let PredicateMethod = "isExactImm<1>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> {
- let ParserMatchClass = exact1_asmoperand;
-}
-
-def exact2_asmoperand : AsmOperandClass {
- let Name = "Exact2";
- let PredicateMethod = "isExactImm<2>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> {
- let ParserMatchClass = exact2_asmoperand;
-}
-
-def exact3_asmoperand : AsmOperandClass {
- let Name = "Exact3";
- let PredicateMethod = "isExactImm<3>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> {
- let ParserMatchClass = exact3_asmoperand;
-}
-
-def exact4_asmoperand : AsmOperandClass {
- let Name = "Exact4";
- let PredicateMethod = "isExactImm<4>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> {
- let ParserMatchClass = exact4_asmoperand;
-}
-
-def exact6_asmoperand : AsmOperandClass {
- let Name = "Exact6";
- let PredicateMethod = "isExactImm<6>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> {
- let ParserMatchClass = exact6_asmoperand;
-}
-
-def exact8_asmoperand : AsmOperandClass {
- let Name = "Exact8";
- let PredicateMethod = "isExactImm<8>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> {
- let ParserMatchClass = exact8_asmoperand;
-}
-
-def exact12_asmoperand : AsmOperandClass {
- let Name = "Exact12";
- let PredicateMethod = "isExactImm<12>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> {
- let ParserMatchClass = exact12_asmoperand;
-}
-
-def exact16_asmoperand : AsmOperandClass {
- let Name = "Exact16";
- let PredicateMethod = "isExactImm<16>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact16 : Operand<i32>, ImmLeaf<i32, [{return Imm == 16;}]> {
- let ParserMatchClass = exact16_asmoperand;
-}
-
-def exact24_asmoperand : AsmOperandClass {
- let Name = "Exact24";
- let PredicateMethod = "isExactImm<24>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact24 : Operand<i32>, ImmLeaf<i32, [{return Imm == 24;}]> {
- let ParserMatchClass = exact24_asmoperand;
-}
-
-def exact32_asmoperand : AsmOperandClass {
- let Name = "Exact32";
- let PredicateMethod = "isExactImm<32>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact32 : Operand<i32>, ImmLeaf<i32, [{return Imm == 32;}]> {
- let ParserMatchClass = exact32_asmoperand;
-}
-
-def exact48_asmoperand : AsmOperandClass {
- let Name = "Exact48";
- let PredicateMethod = "isExactImm<48>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact48 : Operand<i32>, ImmLeaf<i32, [{return Imm == 48;}]> {
- let ParserMatchClass = exact48_asmoperand;
-}
-
-def exact64_asmoperand : AsmOperandClass {
- let Name = "Exact64";
- let PredicateMethod = "isExactImm<64>";
- let RenderMethod = "addImmOperands";
-}
-def uimm_exact64 : Operand<i32>, ImmLeaf<i32, [{return Imm == 64;}]> {
- let ParserMatchClass = exact64_asmoperand;
-}
-
-multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
- RegisterOperand VecList, Operand ImmTy,
- string asmop> {
- let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1,
- DecoderMethod = "DecodeVLDSTPostInstruction" in {
- def _fixed : NeonI_LdStMult_Post<q, 1, opcode, size,
- (outs VecList:$Rt, GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, ImmTy:$amt),
- asmop # "\t$Rt, [$Rn], $amt",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> {
- let Rm = 0b11111;
- }
-
- def _register : NeonI_LdStMult_Post<q, 1, opcode, size,
- (outs VecList:$Rt, GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
- asmop # "\t$Rt, [$Rn], $Rm",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>;
- }
-}
-
-multiclass LDWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
- Operand ImmTy2, string asmop> {
- defm _8B : NeonI_LDWB_VList<0, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"),
- ImmTy, asmop>;
-
- defm _4H : NeonI_LDWB_VList<0, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"),
- ImmTy, asmop>;
-
- defm _2S : NeonI_LDWB_VList<0, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"),
- ImmTy, asmop>;
-
- defm _16B : NeonI_LDWB_VList<1, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"),
- ImmTy2, asmop>;
-
- defm _8H : NeonI_LDWB_VList<1, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"),
- ImmTy2, asmop>;
-
- defm _4S : NeonI_LDWB_VList<1, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"),
- ImmTy2, asmop>;
-
- defm _2D : NeonI_LDWB_VList<1, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"),
- ImmTy2, asmop>;
-}
-
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
-defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
- "ld1">;
-
-defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
-
-defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
- "ld3">;
-
-defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
-
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
- "ld1">;
-defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
- uimm_exact16, "ld1">;
-
-defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
- "ld1">;
-defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
- uimm_exact24, "ld1">;
-
-defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
- "ld1">;
-defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
- uimm_exact32, "ld1">;
-
-multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
- RegisterOperand VecList, Operand ImmTy,
- string asmop> {
- let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1,
- DecoderMethod = "DecodeVLDSTPostInstruction" in {
- def _fixed : NeonI_LdStMult_Post<q, 0, opcode, size,
- (outs GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt),
- asmop # "\t$Rt, [$Rn], $amt",
- [],
- NoItinerary>,
- Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
- let Rm = 0b11111;
- }
-
- def _register : NeonI_LdStMult_Post<q, 0, opcode, size,
- (outs GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt),
- asmop # "\t$Rt, [$Rn], $Rm",
- [],
- NoItinerary>,
- Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>;
- }
-}
-
-multiclass STWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
- Operand ImmTy2, string asmop> {
- defm _8B : NeonI_STWB_VList<0, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"), ImmTy, asmop>;
-
- defm _4H : NeonI_STWB_VList<0, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"),
- ImmTy, asmop>;
-
- defm _2S : NeonI_STWB_VList<0, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"),
- ImmTy, asmop>;
-
- defm _16B : NeonI_STWB_VList<1, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"),
- ImmTy2, asmop>;
-
- defm _8H : NeonI_STWB_VList<1, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"),
- ImmTy2, asmop>;
-
- defm _4S : NeonI_STWB_VList<1, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"),
- ImmTy2, asmop>;
-
- defm _2D : NeonI_STWB_VList<1, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"),
- ImmTy2, asmop>;
-}
-
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
-defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
- "st1">;
-
-defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
-
-defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
- "st3">;
-
-defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
-
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
- "st1">;
-defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
- uimm_exact16, "st1">;
-
-defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
- "st1">;
-defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
- uimm_exact24, "st1">;
-
-defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
- "st1">;
-defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
- uimm_exact32, "st1">;
-
-// End of post-index vector load/store multiple N-element structure
-// (class SIMD lselem-post)
-
-// The followings are vector load/store single N-element structure
-// (class SIMD lsone).
-def neon_uimm0_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm == 0;}]> {
- let ParserMatchClass = neon_uimm0_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm1_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 2;}]> {
- let ParserMatchClass = neon_uimm1_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm2_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 4;}]> {
- let ParserMatchClass = neon_uimm2_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm3_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 8;}]> {
- let ParserMatchClass = uimm3_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm4_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 16;}]> {
- let ParserMatchClass = uimm4_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
- RegisterOperand VecList, string asmop>
- : NeonI_LdOne_Dup<q, r, opcode, size,
- (outs VecList:$Rt), (ins GPR64xsp:$Rn),
- asmop # "\t$Rt, [$Rn]",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, ReadVecLd]> {
- let mayLoad = 1;
- let neverHasSideEffects = 1;
-}
-
-multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> {
- def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
- def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
- def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
- def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11,
- !cast<RegisterOperand>(List # "1D_operand"), asmop>;
-
- def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
- def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
- def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
- def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Load single 1-element structure to all lanes of 1 register
-defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
-
-// Load single N-element structure to all lanes of N consecutive
-// registers (N = 2,3,4)
-defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
-defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
-defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
-
-
-class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
- Instruction INST>
- : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
- (VTy (INST GPR64xsp:$Rn))>;
-
-// Match all LD1R instructions
-def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
-
-def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
-
-def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
-
-def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
-
-def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
-def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
-
-def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
-def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
-
-def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
-def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
-
-class LD1R_pattern_v1 <ValueType VTy, ValueType DTy, PatFrag LoadOp,
- Instruction INST>
- : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))),
- (VTy (INST GPR64xsp:$Rn))>;
-
-def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
-def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
-
-multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
- RegisterClass RegList> {
- defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
- defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
- defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
- defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
-}
-
-// Special vector list operand of 128-bit vectors with bare layout.
-// i.e. only show ".b", ".h", ".s", ".d"
-defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
-defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
-defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
-defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
-
-class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
- Operand ImmOp, string asmop>
- : NeonI_LdStOne_Lane<1, r, op2_1, op0,
- (outs VList:$Rt),
- (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
- asmop # "\t$Rt[$lane], [$Rn]",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, ReadVecLd, ReadVecLd]> {
- let mayLoad = 1;
- let neverHasSideEffects = 1;
- let hasExtraDefRegAllocReq = 1;
- let Constraints = "$src = $Rt";
-}
-
-multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
- def _B : NeonI_LDN_Lane<r, 0b00, op0,
- !cast<RegisterOperand>(List # "B_operand"),
- neon_uimm4_bare, asmop> {
- let Inst{12-10} = lane{2-0};
- let Inst{30} = lane{3};
- }
-
- def _H : NeonI_LDN_Lane<r, 0b01, op0,
- !cast<RegisterOperand>(List # "H_operand"),
- neon_uimm3_bare, asmop> {
- let Inst{12-10} = {lane{1}, lane{0}, 0b0};
- let Inst{30} = lane{2};
- }
-
- def _S : NeonI_LDN_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "S_operand"),
- neon_uimm2_bare, asmop> {
- let Inst{12-10} = {lane{0}, 0b0, 0b0};
- let Inst{30} = lane{1};
- }
-
- def _D : NeonI_LDN_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "D_operand"),
- neon_uimm1_bare, asmop> {
- let Inst{12-10} = 0b001;
- let Inst{30} = lane{0};
- }
-}
-
-// Load single 1-element structure to one lane of 1 register.
-defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
-
-// Load single N-element structure to one lane of N consecutive registers
-// (N = 2,3,4)
-defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
-defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
-defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
-
-multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
- Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
- Instruction INST> {
- def : Pat<(VTy (vector_insert (VTy VPR64:$src),
- (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
- (VTy (EXTRACT_SUBREG
- (INST GPR64xsp:$Rn,
- (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
- ImmOp:$lane),
- sub_64))>;
-
- def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
- (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
- (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
-}
-
-// Match all LD1LN instructions
-defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
- extloadi8, LD1LN_B>;
-
-defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
- extloadi16, LD1LN_H>;
-
-defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
- load, LD1LN_S>;
-defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
- load, LD1LN_S>;
-
-defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
- load, LD1LN_D>;
-defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
- load, LD1LN_D>;
-
-class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
- Operand ImmOp, string asmop>
- : NeonI_LdStOne_Lane<0, r, op2_1, op0,
- (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
- asmop # "\t$Rt[$lane], [$Rn]",
- [],
- NoItinerary>,
- Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
- let mayStore = 1;
- let neverHasSideEffects = 1;
- let hasExtraDefRegAllocReq = 1;
-}
-
-multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
- def _B : NeonI_STN_Lane<r, 0b00, op0,
- !cast<RegisterOperand>(List # "B_operand"),
- neon_uimm4_bare, asmop> {
- let Inst{12-10} = lane{2-0};
- let Inst{30} = lane{3};
- }
-
- def _H : NeonI_STN_Lane<r, 0b01, op0,
- !cast<RegisterOperand>(List # "H_operand"),
- neon_uimm3_bare, asmop> {
- let Inst{12-10} = {lane{1}, lane{0}, 0b0};
- let Inst{30} = lane{2};
- }
-
- def _S : NeonI_STN_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "S_operand"),
- neon_uimm2_bare, asmop> {
- let Inst{12-10} = {lane{0}, 0b0, 0b0};
- let Inst{30} = lane{1};
- }
-
- def _D : NeonI_STN_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "D_operand"),
- neon_uimm1_bare, asmop>{
- let Inst{12-10} = 0b001;
- let Inst{30} = lane{0};
- }
-}
-
-// Store single 1-element structure from one lane of 1 register.
-defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
-
-// Store single N-element structure from one lane of N consecutive registers
-// (N = 2,3,4)
-defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
-defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
-defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
-
-multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
- Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
- Instruction INST> {
- def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)),
- GPR64xsp:$Rn),
- (INST GPR64xsp:$Rn,
- (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
- ImmOp:$lane)>;
-
- def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)),
- GPR64xsp:$Rn),
- (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
-}
-
-// Match all ST1LN instructions
-defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
- truncstorei8, ST1LN_B>;
-
-defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
- truncstorei16, ST1LN_H>;
-
-defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
- store, ST1LN_S>;
-defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
- store, ST1LN_S>;
-
-defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
- store, ST1LN_D>;
-defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
- store, ST1LN_D>;
-
-// End of vector load/store single N-element structure (class SIMD lsone).
-
-
-// The following are post-index load/store single N-element instructions
-// (class SIMD lsone-post)
-
-multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
- RegisterOperand VecList, Operand ImmTy,
- string asmop> {
- let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn",
- DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
- def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size,
- (outs VecList:$Rt, GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, ImmTy:$amt),
- asmop # "\t$Rt, [$Rn], $amt",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> {
- let Rm = 0b11111;
- }
-
- def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
- (outs VecList:$Rt, GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
- asmop # "\t$Rt, [$Rn], $Rm",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>;
- }
-}
-
-multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop,
- Operand uimm_b, Operand uimm_h,
- Operand uimm_s, Operand uimm_d> {
- defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"),
- uimm_b, asmop>;
-
- defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"),
- uimm_h, asmop>;
-
- defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"),
- uimm_s, asmop>;
-
- defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11,
- !cast<RegisterOperand>(List # "1D_operand"),
- uimm_d, asmop>;
-
- defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"),
- uimm_b, asmop>;
-
- defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"),
- uimm_h, asmop>;
-
- defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"),
- uimm_s, asmop>;
-
- defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"),
- uimm_d, asmop>;
-}
-
-// Post-index load single 1-element structure to all lanes of 1 register
-defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
- uimm_exact2, uimm_exact4, uimm_exact8>;
-
-// Post-index load single N-element structure to all lanes of N consecutive
-// registers (N = 2,3,4)
-defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
- uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
- uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
- uimm_exact8, uimm_exact16, uimm_exact32>;
-
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
- Constraints = "$Rn = $wb, $Rt = $src",
- DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
- class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
- Operand ImmTy, Operand ImmOp, string asmop>
- : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
- (outs VList:$Rt, GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, ImmTy:$amt,
- VList:$src, ImmOp:$lane),
- asmop # "\t$Rt[$lane], [$Rn], $amt",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]> {
- let Rm = 0b11111;
- }
-
- class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
- Operand ImmTy, Operand ImmOp, string asmop>
- : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
- (outs VList:$Rt, GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
- VList:$src, ImmOp:$lane),
- asmop # "\t$Rt[$lane], [$Rn], $Rm",
- [],
- NoItinerary>,
- Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd, ReadVecLd]>;
-}
-
-multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
- Operand uimm_b, Operand uimm_h,
- Operand uimm_s, Operand uimm_d> {
- def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
- !cast<RegisterOperand>(List # "B_operand"),
- uimm_b, neon_uimm4_bare, asmop> {
- let Inst{12-10} = lane{2-0};
- let Inst{30} = lane{3};
- }
-
- def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
- !cast<RegisterOperand>(List # "B_operand"),
- uimm_b, neon_uimm4_bare, asmop> {
- let Inst{12-10} = lane{2-0};
- let Inst{30} = lane{3};
- }
-
- def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
- !cast<RegisterOperand>(List # "H_operand"),
- uimm_h, neon_uimm3_bare, asmop> {
- let Inst{12-10} = {lane{1}, lane{0}, 0b0};
- let Inst{30} = lane{2};
- }
-
- def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
- !cast<RegisterOperand>(List # "H_operand"),
- uimm_h, neon_uimm3_bare, asmop> {
- let Inst{12-10} = {lane{1}, lane{0}, 0b0};
- let Inst{30} = lane{2};
- }
-
- def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "S_operand"),
- uimm_s, neon_uimm2_bare, asmop> {
- let Inst{12-10} = {lane{0}, 0b0, 0b0};
- let Inst{30} = lane{1};
- }
-
- def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "S_operand"),
- uimm_s, neon_uimm2_bare, asmop> {
- let Inst{12-10} = {lane{0}, 0b0, 0b0};
- let Inst{30} = lane{1};
- }
-
- def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "D_operand"),
- uimm_d, neon_uimm1_bare, asmop> {
- let Inst{12-10} = 0b001;
- let Inst{30} = lane{0};
- }
-
- def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "D_operand"),
- uimm_d, neon_uimm1_bare, asmop> {
- let Inst{12-10} = 0b001;
- let Inst{30} = lane{0};
- }
-}
-
-// Post-index load single 1-element structure to one lane of 1 register.
-defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
- uimm_exact2, uimm_exact4, uimm_exact8>;
-
-// Post-index load single N-element structure to one lane of N consecutive
-// registers
-// (N = 2,3,4)
-defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
- uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
- uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
- uimm_exact8, uimm_exact16, uimm_exact32>;
-
-let mayStore = 1, neverHasSideEffects = 1,
- hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
- DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
- class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
- Operand ImmTy, Operand ImmOp, string asmop>
- : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
- (outs GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, ImmTy:$amt,
- VList:$Rt, ImmOp:$lane),
- asmop # "\t$Rt[$lane], [$Rn], $amt",
- [],
- NoItinerary>,
- Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
- let Rm = 0b11111;
- }
-
- class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
- Operand ImmTy, Operand ImmOp, string asmop>
- : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
- (outs GPR64xsp:$wb),
- (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
- ImmOp:$lane),
- asmop # "\t$Rt[$lane], [$Rn], $Rm",
- [],
- NoItinerary>,
- Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>;
-}
-
-multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
- Operand uimm_b, Operand uimm_h,
- Operand uimm_s, Operand uimm_d> {
- def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
- !cast<RegisterOperand>(List # "B_operand"),
- uimm_b, neon_uimm4_bare, asmop> {
- let Inst{12-10} = lane{2-0};
- let Inst{30} = lane{3};
- }
-
- def _B_register : STN_WBReg_Lane<r, 0b00, op0,
- !cast<RegisterOperand>(List # "B_operand"),
- uimm_b, neon_uimm4_bare, asmop> {
- let Inst{12-10} = lane{2-0};
- let Inst{30} = lane{3};
- }
-
- def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
- !cast<RegisterOperand>(List # "H_operand"),
- uimm_h, neon_uimm3_bare, asmop> {
- let Inst{12-10} = {lane{1}, lane{0}, 0b0};
- let Inst{30} = lane{2};
- }
-
- def _H_register : STN_WBReg_Lane<r, 0b01, op0,
- !cast<RegisterOperand>(List # "H_operand"),
- uimm_h, neon_uimm3_bare, asmop> {
- let Inst{12-10} = {lane{1}, lane{0}, 0b0};
- let Inst{30} = lane{2};
- }
-
- def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "S_operand"),
- uimm_s, neon_uimm2_bare, asmop> {
- let Inst{12-10} = {lane{0}, 0b0, 0b0};
- let Inst{30} = lane{1};
- }
-
- def _S_register : STN_WBReg_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "S_operand"),
- uimm_s, neon_uimm2_bare, asmop> {
- let Inst{12-10} = {lane{0}, 0b0, 0b0};
- let Inst{30} = lane{1};
- }
-
- def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "D_operand"),
- uimm_d, neon_uimm1_bare, asmop> {
- let Inst{12-10} = 0b001;
- let Inst{30} = lane{0};
- }
-
- def _D_register : STN_WBReg_Lane<r, 0b10, op0,
- !cast<RegisterOperand>(List # "D_operand"),
- uimm_d, neon_uimm1_bare, asmop> {
- let Inst{12-10} = 0b001;
- let Inst{30} = lane{0};
- }
-}
-
-// Post-index store single 1-element structure from one lane of 1 register.
-defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
- uimm_exact2, uimm_exact4, uimm_exact8>;
-
-// Post-index store single N-element structure from one lane of N consecutive
-// registers (N = 2,3,4)
-defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
- uimm_exact4, uimm_exact8, uimm_exact16>;
-defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
- uimm_exact6, uimm_exact12, uimm_exact24>;
-defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
- uimm_exact8, uimm_exact16, uimm_exact32>;
-
-// End of post-index load/store single N-element instructions
-// (class SIMD lsone-post)
-
-// Neon Scalar instructions implementation
-// Scalar Three Same
-
-class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop,
- RegisterClass FPRC>
- : NeonI_Scalar3Same<u, size, opcode,
- (outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
- : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
-
-multiclass NeonI_Scalar3Same_HS_sizes<bit u, bits<5> opcode, string asmop,
- bit Commutable = 0> {
- let isCommutable = Commutable in {
- def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
- def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
- }
-}
-
-multiclass NeonI_Scalar3Same_SD_sizes<bit u, bit size_high, bits<5> opcode,
- string asmop, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def sss : NeonI_Scalar3Same_size<u, {size_high, 0b0}, opcode, asmop, FPR32>;
- def ddd : NeonI_Scalar3Same_size<u, {size_high, 0b1}, opcode, asmop, FPR64>;
- }
-}
-
-multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
- string asmop, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def bbb : NeonI_Scalar3Same_size<u, 0b00, opcode, asmop, FPR8>;
- def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
- def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
- def ddd : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
- }
-}
-
-multiclass Neon_Scalar3Same_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD> {
- def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode,
- Instruction INSTB,
- Instruction INSTH,
- Instruction INSTS,
- Instruction INSTD>
- : Neon_Scalar3Same_D_size_patterns<opnode, INSTD> {
- def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
- (INSTB FPR8:$Rn, FPR8:$Rm)>;
- def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (INSTH FPR16:$Rn, FPR16:$Rm)>;
- def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (INSTS FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
- Instruction INSTH,
- Instruction INSTS> {
- def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (INSTH FPR16:$Rn, FPR16:$Rm)>;
- def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (INSTS FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode,
- ValueType SResTy, ValueType STy,
- Instruction INSTS, ValueType DResTy,
- ValueType DTy, Instruction INSTD> {
- def : Pat<(SResTy (opnode (STy FPR32:$Rn), (STy FPR32:$Rm))),
- (INSTS FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(DResTy (opnode (DTy FPR64:$Rn), (DTy FPR64:$Rm))),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-class Neon_Scalar3Same_cmp_V1_D_size_patterns<CondCode CC,
- Instruction INSTD>
- : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Three Different
-
-class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop,
- RegisterClass FPRCD, RegisterClass FPRCS>
- : NeonI_Scalar3Diff<u, size, opcode,
- (outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> {
- def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>;
- def dss : NeonI_Scalar3Diff_size<u, 0b10, opcode, asmop, FPR64, FPR32>;
-}
-
-multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> {
- let Constraints = "$Src = $Rd" in {
- def shh : NeonI_Scalar3Diff<u, 0b01, opcode,
- (outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>;
- def dss : NeonI_Scalar3Diff<u, 0b10, opcode,
- (outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm),
- !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-multiclass Neon_Scalar3Diff_HS_size_patterns<SDPatternOperator opnode,
- Instruction INSTH,
- Instruction INSTS> {
- def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (INSTH FPR16:$Rn, FPR16:$Rm)>;
- def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (INSTS FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass Neon_Scalar3Diff_ml_HS_size_patterns<SDPatternOperator opnode,
- Instruction INSTH,
- Instruction INSTS> {
- def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>;
- def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>;
-}
-
-// Scalar Two Registers Miscellaneous
-
-class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asmop,
- RegisterClass FPRCD, RegisterClass FPRCS>
- : NeonI_Scalar2SameMisc<u, size, opcode,
- (outs FPRCD:$Rd), (ins FPRCS:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode,
- string asmop> {
- def ss : NeonI_Scalar2SameMisc_size<u, {size_high, 0b0}, opcode, asmop, FPR32,
- FPR32>;
- def dd : NeonI_Scalar2SameMisc_size<u, {size_high, 0b1}, opcode, asmop, FPR64,
- FPR64>;
-}
-
-multiclass NeonI_Scalar2SameMisc_D_size<bit u, bits<5> opcode, string asmop> {
- def dd : NeonI_Scalar2SameMisc_size<u, 0b11, opcode, asmop, FPR64, FPR64>;
-}
-
-multiclass NeonI_Scalar2SameMisc_BHSD_size<bit u, bits<5> opcode, string asmop>
- : NeonI_Scalar2SameMisc_D_size<u, opcode, asmop> {
- def bb : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR8>;
- def hh : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR16>;
- def ss : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR32>;
-}
-
-class NeonI_Scalar2SameMisc_fcvtxn_D_size<bit u, bits<5> opcode, string asmop>
- : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR32, FPR64>;
-
-multiclass NeonI_Scalar2SameMisc_narrow_HSD_size<bit u, bits<5> opcode,
- string asmop> {
- def bh : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR16>;
- def hs : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR32>;
- def sd : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR64>;
-}
-
-class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode,
- string asmop, RegisterClass FPRC>
- : NeonI_Scalar2SameMisc<u, size, opcode,
- (outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
- string asmop> {
-
- let Constraints = "$Src = $Rd" in {
- def bb : NeonI_Scalar2SameMisc_accum_size<u, 0b00, opcode, asmop, FPR8>;
- def hh : NeonI_Scalar2SameMisc_accum_size<u, 0b01, opcode, asmop, FPR16>;
- def ss : NeonI_Scalar2SameMisc_accum_size<u, 0b10, opcode, asmop, FPR32>;
- def dd : NeonI_Scalar2SameMisc_accum_size<u, 0b11, opcode, asmop, FPR64>;
- }
-}
-
-class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(f32 (opnode (f64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-
-multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(v1i32 (opnode (f32 FPR32:$Rn))),
- (INSTS FPR32:$Rn)>;
- def : Pat<(v1i64 (opnode (f64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-}
-
-class Neon_Scalar2SameMisc_vcvt_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-
-multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(f32 (opnode (v1i32 FPR32:$Rn))),
- (INSTS FPR32:$Rn)>;
- def : Pat<(f64 (opnode (v1i64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-}
-
-multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(f32 (opnode (f32 FPR32:$Rn))),
- (INSTS FPR32:$Rn)>;
- def : Pat<(f64 (opnode (f64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-}
-
-class Neon_Scalar2SameMisc_V1_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-
-class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop>
- : NeonI_Scalar2SameMisc<u, 0b11, opcode,
- (outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode,
- string asmop> {
- def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode,
- (outs FPR32:$Rd), (ins FPR32:$Rn, fpzz32:$FPImm),
- !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
- def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode,
- (outs FPR64:$Rd), (ins FPR64:$Rn, fpzz32:$FPImm),
- !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
- (v1i64 (bitconvert (v8i8 Neon_AllZero))))),
- (INSTD FPR64:$Rn, 0)>;
-
-class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC,
- Instruction INSTD>
- : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn),
- (i32 neon_uimm0:$Imm), CC)),
- (INSTD FPR64:$Rn, neon_uimm0:$Imm)>;
-
-multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode,
- CondCode CC,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (f32 fpzz32:$FPImm))),
- (INSTS FPR32:$Rn, fpzz32:$FPImm)>;
- def : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (f32 fpzz32:$FPImm))),
- (INSTD FPR64:$Rn, fpzz32:$FPImm)>;
- def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpzz32:$FPImm), CC)),
- (INSTD FPR64:$Rn, fpzz32:$FPImm)>;
-}
-
-multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD> {
- def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-}
-
-multiclass Neon_Scalar2SameMisc_BHSD_size_patterns<SDPatternOperator opnode,
- Instruction INSTB,
- Instruction INSTH,
- Instruction INSTS,
- Instruction INSTD>
- : Neon_Scalar2SameMisc_D_size_patterns<opnode, INSTD> {
- def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))),
- (INSTB FPR8:$Rn)>;
- def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))),
- (INSTH FPR16:$Rn)>;
- def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))),
- (INSTS FPR32:$Rn)>;
-}
-
-multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns<
- SDPatternOperator opnode,
- Instruction INSTH,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))),
- (INSTH FPR16:$Rn)>;
- def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))),
- (INSTS FPR32:$Rn)>;
- def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))),
- (INSTD FPR64:$Rn)>;
-
-}
-
-multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns<
- SDPatternOperator opnode,
- Instruction INSTB,
- Instruction INSTH,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))),
- (INSTB FPR8:$Src, FPR8:$Rn)>;
- def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))),
- (INSTH FPR16:$Src, FPR16:$Rn)>;
- def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))),
- (INSTS FPR32:$Src, FPR32:$Rn)>;
- def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))),
- (INSTD FPR64:$Src, FPR64:$Rn)>;
-}
-
-// Scalar Shift By Immediate
-
-class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop,
- RegisterClass FPRC, Operand ImmTy>
- : NeonI_ScalarShiftImm<u, opcode,
- (outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode,
- string asmop> {
- def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
- bits<6> Imm;
- let Inst{22} = 0b1; // immh:immb = 1xxxxxx
- let Inst{21-16} = Imm;
- }
-}
-
-multiclass NeonI_ScalarShiftRightImm_BHSD_size<bit u, bits<5> opcode,
- string asmop>
- : NeonI_ScalarShiftRightImm_D_size<u, opcode, asmop> {
- def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shr_imm8> {
- bits<3> Imm;
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- let Inst{18-16} = Imm;
- }
- def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shr_imm16> {
- bits<4> Imm;
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- let Inst{19-16} = Imm;
- }
- def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
- bits<5> Imm;
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- let Inst{20-16} = Imm;
- }
-}
-
-multiclass NeonI_ScalarShiftLeftImm_D_size<bit u, bits<5> opcode,
- string asmop> {
- def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shl_imm64> {
- bits<6> Imm;
- let Inst{22} = 0b1; // immh:immb = 1xxxxxx
- let Inst{21-16} = Imm;
- }
-}
-
-multiclass NeonI_ScalarShiftLeftImm_BHSD_size<bit u, bits<5> opcode,
- string asmop>
- : NeonI_ScalarShiftLeftImm_D_size<u, opcode, asmop> {
- def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shl_imm8> {
- bits<3> Imm;
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- let Inst{18-16} = Imm;
- }
- def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shl_imm16> {
- bits<4> Imm;
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- let Inst{19-16} = Imm;
- }
- def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shl_imm32> {
- bits<5> Imm;
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- let Inst{20-16} = Imm;
- }
-}
-
-class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop>
- : NeonI_ScalarShiftImm<u, opcode,
- (outs FPR64:$Rd),
- (ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- bits<6> Imm;
- let Inst{22} = 0b1; // immh:immb = 1xxxxxx
- let Inst{21-16} = Imm;
- let Constraints = "$Src = $Rd";
-}
-
-class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop>
- : NeonI_ScalarShiftImm<u, opcode,
- (outs FPR64:$Rd),
- (ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- bits<6> Imm;
- let Inst{22} = 0b1; // immh:immb = 1xxxxxx
- let Inst{21-16} = Imm;
- let Constraints = "$Src = $Rd";
-}
-
-class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop,
- RegisterClass FPRCD, RegisterClass FPRCS,
- Operand ImmTy>
- : NeonI_ScalarShiftImm<u, opcode,
- (outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm),
- !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode,
- string asmop> {
- def bhi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR8, FPR16,
- shr_imm8> {
- bits<3> Imm;
- let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
- let Inst{18-16} = Imm;
- }
- def hsi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR16, FPR32,
- shr_imm16> {
- bits<4> Imm;
- let Inst{22-20} = 0b001; // immh:immb = 001xxxx
- let Inst{19-16} = Imm;
- }
- def sdi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR32, FPR64,
- shr_imm32> {
- bits<5> Imm;
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- let Inst{20-16} = Imm;
- }
-}
-
-multiclass NeonI_ScalarShiftImm_cvt_SD_size<bit u, bits<5> opcode, string asmop> {
- def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
- bits<5> Imm;
- let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
- let Inst{20-16} = Imm;
- }
- def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
- bits<6> Imm;
- let Inst{22} = 0b1; // immh:immb = 1xxxxxx
- let Inst{21-16} = Imm;
- }
-}
-
-multiclass Neon_ScalarShiftRImm_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD> {
- def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD> {
- def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-class Neon_ScalarShiftLImm_V1_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
- (v1i64 (Neon_vdup (i32 shl_imm64:$Imm))))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-
-class Neon_ScalarShiftRImm_V1_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
- (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-
-multiclass Neon_ScalarShiftLImm_BHSD_size_patterns<SDPatternOperator opnode,
- Instruction INSTB,
- Instruction INSTH,
- Instruction INSTS,
- Instruction INSTD>
- : Neon_ScalarShiftLImm_D_size_patterns<opnode, INSTD> {
- def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))),
- (INSTB FPR8:$Rn, imm:$Imm)>;
- def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))),
- (INSTH FPR16:$Rn, imm:$Imm)>;
- def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))),
- (INSTS FPR32:$Rn, imm:$Imm)>;
-}
-
-class Neon_ScalarShiftLImm_accum_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
- (i32 shl_imm64:$Imm))),
- (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
-
-class Neon_ScalarShiftRImm_accum_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
- (i32 shr_imm64:$Imm))),
- (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
-
-multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns<
- SDPatternOperator opnode,
- Instruction INSTH,
- Instruction INSTS,
- Instruction INSTD> {
- def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))),
- (INSTH FPR16:$Rn, imm:$Imm)>;
- def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
- (INSTS FPR32:$Rn, imm:$Imm)>;
- def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def ssi : Pat<(f32 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
- (INSTS FPR32:$Rn, imm:$Imm)>;
- def ddi : Pat<(f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def ssi : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
- (INSTS FPR32:$Rn, imm:$Imm)>;
- def ddi : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
- (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-// Scalar Signed Shift Right (Immediate)
-defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftRImm_V1_D_size_patterns<sra, SSHRddi>;
-
-// Scalar Unsigned Shift Right (Immediate)
-defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftRImm_V1_D_size_patterns<srl, USHRddi>;
-
-// Scalar Signed Rounding Shift Right (Immediate)
-defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vsrshr, SRSHRddi>;
-
-// Scalar Unigned Rounding Shift Right (Immediate)
-defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vurshr, URSHRddi>;
-
-// Scalar Signed Shift Right and Accumulate (Immediate)
-def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
- <int_aarch64_neon_vsrads_n, SSRA>;
-
-// Scalar Unsigned Shift Right and Accumulate (Immediate)
-def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
- <int_aarch64_neon_vsradu_n, USRA>;
-
-// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
-def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
- <int_aarch64_neon_vrsrads_n, SRSRA>;
-
-// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
-def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
- <int_aarch64_neon_vrsradu_n, URSRA>;
-
-// Scalar Shift Left (Immediate)
-defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
-defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftLImm_V1_D_size_patterns<shl, SHLddi>;
-
-// Signed Saturating Shift Left (Immediate)
-defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshls_n,
- SQSHLbbi, SQSHLhhi,
- SQSHLssi, SQSHLddi>;
-// Pattern to match llvm.arm.* intrinsic.
-defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_sqrshlImm, SQSHLddi>;
-
-// Unsigned Saturating Shift Left (Immediate)
-defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshlu_n,
- UQSHLbbi, UQSHLhhi,
- UQSHLssi, UQSHLddi>;
-// Pattern to match llvm.arm.* intrinsic.
-defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_uqrshlImm, UQSHLddi>;
-
-// Signed Saturating Shift Left Unsigned (Immediate)
-defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vsqshlu,
- SQSHLUbbi, SQSHLUhhi,
- SQSHLUssi, SQSHLUddi>;
-
-// Shift Right And Insert (Immediate)
-def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
- <int_aarch64_neon_vsri, SRI>;
-
-// Shift Left And Insert (Immediate)
-def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">;
-def : Neon_ScalarShiftLImm_accum_D_size_patterns
- <int_aarch64_neon_vsli, SLI>;
-
-// Signed Saturating Shift Right Narrow (Immediate)
-defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrn,
- SQSHRNbhi, SQSHRNhsi,
- SQSHRNsdi>;
-
-// Unsigned Saturating Shift Right Narrow (Immediate)
-defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqshrn,
- UQSHRNbhi, UQSHRNhsi,
- UQSHRNsdi>;
-
-// Signed Saturating Rounded Shift Right Narrow (Immediate)
-defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrn,
- SQRSHRNbhi, SQRSHRNhsi,
- SQRSHRNsdi>;
-
-// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
-defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqrshrn,
- UQRSHRNbhi, UQRSHRNhsi,
- UQRSHRNsdi>;
-
-// Signed Saturating Shift Right Unsigned Narrow (Immediate)
-defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrun,
- SQSHRUNbhi, SQSHRUNhsi,
- SQSHRUNsdi>;
-
-// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
-defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun,
- SQRSHRUNbhi, SQRSHRUNhsi,
- SQRSHRUNsdi>;
-
-// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
-defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">;
-defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtfxs2fp_n,
- SCVTF_Nssi, SCVTF_Nddi>;
-
-// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
-defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">;
-defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtfxu2fp_n,
- UCVTF_Nssi, UCVTF_Nddi>;
-
-// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
-defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">;
-defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvtfp2fxs_n,
- FCVTZS_Nssi, FCVTZS_Nddi>;
-
-// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
-defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">;
-defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvtfp2fxu_n,
- FCVTZU_Nssi, FCVTZU_Nddi>;
-
-// Patterns For Convert Instructions Between v1f64 and v1i64
-class Neon_ScalarShiftImm_cvtf_v1f64_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
- (INST FPR64:$Rn, imm:$Imm)>;
-
-class Neon_ScalarShiftImm_fcvt_v1f64_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
- (INST FPR64:$Rn, imm:$Imm)>;
-
-def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxs2fp,
- SCVTF_Nddi>;
-
-def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxu2fp,
- UCVTF_Nddi>;
-
-def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxs,
- FCVTZS_Nddi>;
-
-def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxu,
- FCVTZU_Nddi>;
-
-// Scalar Integer Add
-let isCommutable = 1 in {
-def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">;
-}
-
-// Scalar Integer Sub
-def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">;
-
-// Pattern for Scalar Integer Add and Sub with D register only
-defm : Neon_Scalar3Same_D_size_patterns<add, ADDddd>;
-defm : Neon_Scalar3Same_D_size_patterns<sub, SUBddd>;
-
-// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vaddds, ADDddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vadddu, ADDddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubds, SUBddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubdu, SUBddd>;
-
-// Scalar Integer Saturating Add (Signed, Unsigned)
-defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>;
-defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>;
-
-// Scalar Integer Saturating Sub (Signed, Unsigned)
-defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>;
-defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>;
-
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Add, Sub (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqadds, SQADDbbb,
- SQADDhhh, SQADDsss, SQADDddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqaddu, UQADDbbb,
- UQADDhhh, UQADDsss, UQADDddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubs, SQSUBbbb,
- SQSUBhhh, SQSUBsss, SQSUBddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb,
- UQSUBhhh, UQSUBsss, UQSUBddd>;
-
-// Scalar Integer Saturating Doubling Multiply Half High
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in
-defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>;
-
-// Scalar Integer Saturating Rounding Doubling Multiply Half High
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>;
-}
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Doubling Multiply Half High and
-// Scalar Integer Saturating Rounding Doubling Multiply Half High
-defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh,
- SQDMULHsss>;
-defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh,
- SQRDMULHsss>;
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
-// Scalar Floating-point Multiply Extended
-defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>;
-}
-
-// Scalar Floating-point Reciprocal Step
-defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vrecps, f32, f32,
- FRECPSsss, f64, f64, FRECPSddd>;
-def : Pat<(v1f64 (int_arm_neon_vrecps (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (FRECPSddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Reciprocal Square Root Step
-defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vrsqrts, f32, f32,
- FRSQRTSsss, f64, f64, FRSQRTSddd>;
-def : Pat<(v1f64 (int_arm_neon_vrsqrts (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (FRSQRTSddd FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Floating-point Multiply Extended,
-multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
- (INSTS FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx,
- FMULXsss, FMULXddd>;
-def : Pat<(v1f64 (int_aarch64_neon_vmulx (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (FMULXddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Integer Shift Left (Signed, Unsigned)
-def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
-def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshlds, SSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshldu, USHLddd>;
-
-// Scalar Integer Saturating Shift Left (Signed, Unsigned)
-defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
-defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Shift Letf (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshls, SQSHLbbb,
- SQSHLhhh, SQSHLsss, SQSHLddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshlu, UQSHLbbb,
- UQSHLhhh, UQSHLsss, UQSHLddd>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Shift Letf (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
-
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
-def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshlds, SRSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshldu, URSHLddd>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
-
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
-defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshls, SQRSHLbbb,
- SQRSHLhhh, SQRSHLsss, SQRSHLddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb,
- UQRSHLhhh, UQRSHLsss, UQRSHLddd>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
-
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
-// Signed Saturating Doubling Multiply-Add Long
-defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
-}
-defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal,
- SQDMLALshh, SQDMLALdss>;
-
-// Signed Saturating Doubling Multiply-Subtract Long
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
-defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
-}
-defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl,
- SQDMLSLshh, SQDMLSLdss>;
-
-// Signed Saturating Doubling Multiply Long
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
-defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
-}
-defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull,
- SQDMULLshh, SQDMULLdss>;
-
-// Scalar Signed Integer Convert To Floating-point
-defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtint2fps,
- SCVTFss, SCVTFdd>;
-
-// Scalar Unsigned Integer Convert To Floating-point
-defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtint2fpu,
- UCVTFss, UCVTFdd>;
-
-// Scalar Floating-point Converts
-def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">;
-def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn,
- FCVTXN>;
-
-defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns,
- FCVTNSss, FCVTNSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtns, FCVTNSdd>;
-
-defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu,
- FCVTNUss, FCVTNUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtnu, FCVTNUdd>;
-
-defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms,
- FCVTMSss, FCVTMSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtms, FCVTMSdd>;
-
-defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu,
- FCVTMUss, FCVTMUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtmu, FCVTMUdd>;
-
-defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas,
- FCVTASss, FCVTASdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtas, FCVTASdd>;
-
-defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau,
- FCVTAUss, FCVTAUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtau, FCVTAUdd>;
-
-defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps,
- FCVTPSss, FCVTPSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtps, FCVTPSdd>;
-
-defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu,
- FCVTPUss, FCVTPUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtpu, FCVTPUdd>;
-
-defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs,
- FCVTZSss, FCVTZSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_aarch64_neon_vcvtzs,
- FCVTZSdd>;
-
-defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu,
- FCVTZUss, FCVTZUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_aarch64_neon_vcvtzu,
- FCVTZUdd>;
-
-// Patterns For Convert Instructions Between v1f64 and v1i64
-class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<sint_to_fp, SCVTFdd>;
-def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<uint_to_fp, UCVTFdd>;
-
-def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_sint, FCVTZSdd>;
-def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>;
-
-// Scalar Floating-point Reciprocal Estimate
-defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpe,
- FRECPEss, FRECPEdd>;
-def : Neon_Scalar2SameMisc_V1_D_size_patterns<int_arm_neon_vrecpe,
- FRECPEdd>;
-
-// Scalar Floating-point Reciprocal Exponent
-defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx,
- FRECPXss, FRECPXdd>;
-
-// Scalar Floating-point Reciprocal Square Root Estimate
-defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrsqrte,
- FRSQRTEss, FRSQRTEdd>;
-def : Neon_Scalar2SameMisc_V1_D_size_patterns<int_arm_neon_vrsqrte,
- FRSQRTEdd>;
-
-// Scalar Floating-point Round
-class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-def : Neon_ScalarFloatRound_pattern<fceil, FRINTPdd>;
-def : Neon_ScalarFloatRound_pattern<ffloor, FRINTMdd>;
-def : Neon_ScalarFloatRound_pattern<ftrunc, FRINTZdd>;
-def : Neon_ScalarFloatRound_pattern<frint, FRINTXdd>;
-def : Neon_ScalarFloatRound_pattern<fnearbyint, FRINTIdd>;
-def : Neon_ScalarFloatRound_pattern<frnd, FRINTAdd>;
-def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>;
-
-// Scalar Integer Compare
-
-// Scalar Compare Bitwise Equal
-def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
-
-class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
- Instruction INSTD,
- CondCode CC>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>;
-
-// Scalar Compare Signed Greather Than Or Equal
-def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>;
-
-// Scalar Compare Unsigned Higher Or Same
-def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>;
-
-// Scalar Compare Unsigned Higher
-def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>;
-
-// Scalar Compare Signed Greater Than
-def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>;
-
-// Scalar Compare Bitwise Test Bits
-def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
-defm : Neon_Scalar3Same_D_size_patterns<Neon_tst, CMTSTddd>;
-
-// Scalar Compare Bitwise Equal To Zero
-def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vceq,
- CMEQddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETEQ, CMEQddi>;
-
-// Scalar Compare Signed Greather Than Or Equal To Zero
-def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcge,
- CMGEddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGE, CMGEddi>;
-
-// Scalar Compare Signed Greater Than Zero
-def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcgt,
- CMGTddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGT, CMGTddi>;
-
-// Scalar Compare Signed Less Than Or Equal To Zero
-def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vclez,
- CMLEddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLE, CMLEddi>;
-
-// Scalar Compare Less Than Zero
-def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcltz,
- CMLTddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
-
-// Scalar Floating-point Compare
-
-// Scalar Floating-point Compare Mask Equal
-defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fceq, v1i32, f32,
- FCMEQsss, v1i64, f64, FCMEQddd>;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
-
-// Scalar Floating-point Compare Mask Equal To Zero
-defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fceq, SETEQ,
- FCMEQZssi, FCMEQZddi>;
-
-// Scalar Floating-point Compare Mask Greater Than Or Equal
-defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcge, v1i32, f32,
- FCMGEsss, v1i64, f64, FCMGEddd>;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
-
-// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
-defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcge, SETGE,
- FCMGEZssi, FCMGEZddi>;
-
-// Scalar Floating-point Compare Mask Greather Than
-defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcgt, v1i32, f32,
- FCMGTsss, v1i64, f64, FCMGTddd>;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
-
-// Scalar Floating-point Compare Mask Greather Than Zero
-defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcgt, SETGT,
- FCMGTZssi, FCMGTZddi>;
-
-// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
-defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fclez, SETLE,
- FCMLEZssi, FCMLEZddi>;
-
-// Scalar Floating-point Compare Mask Less Than Zero
-defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcltz, SETLT,
- FCMLTZssi, FCMLTZddi>;
-
-// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
-defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcage, v1i32, f32,
- FACGEsss, v1i64, f64, FACGEddd>;
-def : Pat<(v1i64 (int_arm_neon_vacge (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (FACGEddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Absolute Compare Mask Greater Than
-defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcagt, v1i32, f32,
- FACGTsss, v1i64, f64, FACGTddd>;
-def : Pat<(v1i64 (int_arm_neon_vacgt (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (FACGTddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Absolute Difference
-defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd, f32, f32,
- FABDsss, f64, f64, FABDddd>;
-
-// Scalar Absolute Value
-defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
-defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vabs, ABSdd>;
-
-// Scalar Signed Saturating Absolute Value
-defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">;
-defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqabs,
- SQABSbb, SQABShh, SQABSss, SQABSdd>;
-
-// Scalar Negate
-defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">;
-defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vneg, NEGdd>;
-
-// Scalar Signed Saturating Negate
-defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">;
-defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqneg,
- SQNEGbb, SQNEGhh, SQNEGss, SQNEGdd>;
-
-// Scalar Signed Saturating Accumulated of Unsigned Value
-defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">;
-defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vuqadd,
- SUQADDbb, SUQADDhh,
- SUQADDss, SUQADDdd>;
-
-// Scalar Unsigned Saturating Accumulated of Signed Value
-defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">;
-defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vsqadd,
- USQADDbb, USQADDhh,
- USQADDss, USQADDdd>;
-
-def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src),
- (v1i64 FPR64:$Rn))),
- (SUQADDdd FPR64:$Src, FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src),
- (v1i64 FPR64:$Rn))),
- (USQADDdd FPR64:$Src, FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))),
- (ABSdd FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))),
- (SQABSdd FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))),
- (SQNEGdd FPR64:$Rn)>;
-
-def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))),
- (v1i64 FPR64:$Rn))),
- (NEGdd FPR64:$Rn)>;
-
-// Scalar Signed Saturating Extract Unsigned Narrow
-defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">;
-defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnsu,
- SQXTUNbh, SQXTUNhs,
- SQXTUNsd>;
-
-// Scalar Signed Saturating Extract Narrow
-defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">;
-defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovns,
- SQXTNbh, SQXTNhs,
- SQXTNsd>;
-
-// Scalar Unsigned Saturating Extract Narrow
-defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">;
-defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnu,
- UQXTNbh, UQXTNhs,
- UQXTNsd>;
-
-// Scalar Reduce Pairwise
-
-multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode,
- string asmop, bit Commutable = 0> {
- let isCommutable = Commutable in {
- def _D_2D : NeonI_ScalarPair<u, {size, 0b1}, opcode,
- (outs FPR64:$Rd), (ins VPR128:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn.2d"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
- }
-}
-
-multiclass NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode,
- string asmop, bit Commutable = 0>
- : NeonI_ScalarPair_D_sizes<u, size, opcode, asmop, Commutable> {
- let isCommutable = Commutable in {
- def _S_2S : NeonI_ScalarPair<u, {size, 0b0}, opcode,
- (outs FPR32:$Rd), (ins VPR64:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn.2s"),
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
- }
-}
-
-// Scalar Reduce Addition Pairwise (Integer) with
-// Pattern to match llvm.arm.* intrinsic
-defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>;
-
-// Pattern to match llvm.aarch64.* intrinsic for
-// Scalar Reduce Addition Pairwise (Integer)
-def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))),
- (ADDPvv_D_2D VPR128:$Rn)>;
-def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))),
- (ADDPvv_D_2D VPR128:$Rn)>;
-
-// Scalar Reduce Addition Pairwise (Floating Point)
-defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>;
-
-// Scalar Reduce Maximum Pairwise (Floating Point)
-defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>;
-
-// Scalar Reduce Minimum Pairwise (Floating Point)
-defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
-
-// Scalar Reduce maxNum Pairwise (Floating Point)
-defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
-
-// Scalar Reduce minNum Pairwise (Floating Point)
-defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
-
-multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(f32 (opnode (v2f32 VPR64:$Rn))),
- (INSTS VPR64:$Rn)>;
- def : Pat<(f64 (opnode (v2f64 VPR128:$Rn))),
- (INSTD VPR128:$Rn)>;
-}
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point)
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd,
- FADDPvv_S_2S, FADDPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax,
- FMAXPvv_S_2S, FMAXPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin,
- FMINPvv_S_2S, FMINPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
- FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
- FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
-
-def : Pat<(f32 (int_aarch64_neon_vpfadd (v4f32 VPR128:$Rn))),
- (FADDPvv_S_2S (v2f32
- (EXTRACT_SUBREG
- (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
- sub_64)))>;
-
-// Scalar by element Arithmetic
-
-class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
- string rmlane, bit u, bit szhi, bit szlo,
- RegisterClass ResFPR, RegisterClass OpFPR,
- RegisterOperand OpVPR, Operand OpImm>
- : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
- (outs ResFPR:$Rd),
- (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
- asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
- [],
- NoItinerary>,
- Sched<[WriteFPMul, ReadFPMul, ReadFPMul]> {
- bits<3> Imm;
- bits<5> MRm;
-}
-
-class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode,
- string rmlane,
- bit u, bit szhi, bit szlo,
- RegisterClass ResFPR,
- RegisterClass OpFPR,
- RegisterOperand OpVPR,
- Operand OpImm>
- : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
- (outs ResFPR:$Rd),
- (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
- asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
- [],
- NoItinerary>,
- Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
- let Constraints = "$src = $Rd";
- bits<3> Imm;
- bits<5> MRm;
-}
-
-// Scalar Floating Point multiply (scalar, by element)
-def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
- 0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
- 0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
- let Inst{11} = Imm{0}; // h
- let Inst{21} = 0b0; // l
- let Inst{20-16} = MRm;
-}
-
-// Scalar Floating Point multiply extended (scalar, by element)
-def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
- 0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
- 0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
- let Inst{11} = Imm{0}; // h
- let Inst{21} = 0b0; // l
- let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
- SDPatternOperator opnode,
- Instruction INST,
- ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
- ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
-
- def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
- (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
- (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
- (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
- (ResTy (INST (ResTy FPRC:$Rn),
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
- OpNImm:$Imm))>;
-
- // swapped operands
- def : Pat<(ResTy (opnode
- (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
- (ResTy FPRC:$Rn))),
- (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (opnode
- (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
- (ResTy FPRC:$Rn))),
- (ResTy (INST (ResTy FPRC:$Rn),
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
- OpNImm:$Imm))>;
-}
-
-// Patterns for Scalar Floating Point multiply (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULssv_4S,
- f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULddv_2D,
- f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
-
-// Patterns for Scalar Floating Point multiply extended (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
- FMULXssv_4S, f32, FPR32, v4f32, neon_uimm2_bare,
- v2f32, v4f32, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
- FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare,
- v1f64, v2f64, neon_uimm0_bare>;
-
-// Scalar Floating Point fused multiply-add (scalar, by element)
-def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
- 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
- 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
- let Inst{11} = Imm{0}; // h
- let Inst{21} = 0b0; // l
- let Inst{20-16} = MRm;
-}
-
-// Scalar Floating Point fused multiply-subtract (scalar, by element)
-def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
- 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
- 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
- let Inst{11} = Imm{0}; // h
- let Inst{21} = 0b0; // l
- let Inst{20-16} = MRm;
-}
-// We are allowed to match the fma instruction regardless of compile options.
-multiclass Neon_ScalarXIndexedElem_FMA_Patterns<
- Instruction FMLAI, Instruction FMLSI,
- ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
- ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
- // fmla
- def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
- (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLAI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
- (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLAI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn),
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
- OpNImm:$Imm))>;
-
- // swapped fmla operands
- def : Pat<(ResTy (fma
- (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
- (ResTy FPRC:$Rn),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLAI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (fma
- (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
- (ResTy FPRC:$Rn),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLAI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn),
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
- OpNImm:$Imm))>;
-
- // fmls
- def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
- (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLSI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
- (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLSI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn),
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
- OpNImm:$Imm))>;
-
- // swapped fmls operands
- def : Pat<(ResTy (fma
- (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
- (ResTy FPRC:$Rn),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLSI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (fma
- (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
- (ResTy FPRC:$Rn),
- (ResTy FPRC:$Ra))),
- (ResTy (FMLSI (ResTy FPRC:$Ra),
- (ResTy FPRC:$Rn),
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
- OpNImm:$Imm))>;
-}
-
-// Scalar Floating Point fused multiply-add and
-// multiply-subtract (scalar, by element)
-defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAssv_4S, FMLSssv_4S,
- f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
- f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
-defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
- f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
-
-// Scalar Signed saturating doubling multiply long (scalar, by element)
-def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull",
- 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull",
- 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
- let Inst{11} = Imm{2}; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull",
- 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull",
- 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MUL_Patterns<
- SDPatternOperator opnode,
- Instruction INST,
- ValueType ResTy, RegisterClass FPRC,
- ValueType OpVTy, ValueType OpTy,
- ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
-
- def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
- (OpVTy (scalar_to_vector
- (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))),
- (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
- (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
- (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-
- //swapped operands
- def : Pat<(ResTy (opnode
- (OpVTy (scalar_to_vector
- (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
- (OpVTy FPRC:$Rn))),
- (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (opnode
- (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)),
- (OpVTy FPRC:$Rn))),
- (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-}
-
-
-// Patterns for Scalar Signed saturating doubling
-// multiply long (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
- SQDMULLshv_4H, v1i32, FPR16, v1i16, i16, v4i16,
- i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
- SQDMULLshv_8H, v1i32, FPR16, v1i16, i16, v8i16,
- i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
- SQDMULLdsv_2S, v1i64, FPR32, v1i32, i32, v2i32,
- i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
- SQDMULLdsv_4S, v1i64, FPR32, v1i32, i32, v4i32,
- i32, VPR128Lo, neon_uimm2_bare>;
-
-// Scalar Signed saturating doubling multiply-add long (scalar, by element)
-def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
- 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
- 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
- let Inst{11} = Imm{2}; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
- 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
- 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-
-// Scalar Signed saturating doubling
-// multiply-subtract long (scalar, by element)
-def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
- 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
- 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
- let Inst{11} = Imm{2}; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
- 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
- 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
- SDPatternOperator opnode,
- SDPatternOperator coreopnode,
- Instruction INST,
- ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC,
- ValueType OpTy,
- ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
-
- def : Pat<(ResTy (opnode
- (ResTy ResFPRC:$Ra),
- (ResTy (coreopnode (OpTy FPRC:$Rn),
- (OpTy (scalar_to_vector
- (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))),
- (ResTy (INST (ResTy ResFPRC:$Ra),
- (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (opnode
- (ResTy ResFPRC:$Ra),
- (ResTy (coreopnode (OpTy FPRC:$Rn),
- (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)))))),
- (ResTy (INST (ResTy ResFPRC:$Ra),
- (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-
- // swapped operands
- def : Pat<(ResTy (opnode
- (ResTy ResFPRC:$Ra),
- (ResTy (coreopnode
- (OpTy (scalar_to_vector
- (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))),
- (OpTy FPRC:$Rn))))),
- (ResTy (INST (ResTy ResFPRC:$Ra),
- (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-
- def : Pat<(ResTy (opnode
- (ResTy ResFPRC:$Ra),
- (ResTy (coreopnode
- (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)),
- (OpTy FPRC:$Rn))))),
- (ResTy (INST (ResTy ResFPRC:$Ra),
- (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-}
-
-// Patterns for Scalar Signed saturating
-// doubling multiply-add long (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
- int_arm_neon_vqdmull, SQDMLALshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
- i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
- int_arm_neon_vqdmull, SQDMLALshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
- i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
- int_arm_neon_vqdmull, SQDMLALdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
- i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
- int_arm_neon_vqdmull, SQDMLALdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
- i32, VPR128Lo, neon_uimm2_bare>;
-
-// Patterns for Scalar Signed saturating
-// doubling multiply-sub long (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
- int_arm_neon_vqdmull, SQDMLSLshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
- i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
- int_arm_neon_vqdmull, SQDMLSLshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
- i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
- int_arm_neon_vqdmull, SQDMLSLdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
- i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
- int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
- i32, VPR128Lo, neon_uimm2_bare>;
-
-// Scalar Signed saturating doubling multiply returning
-// high half (scalar, by element)
-def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
- 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
- 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
- let Inst{11} = Imm{2}; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
- 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
- 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-
-// Patterns for Scalar Signed saturating doubling multiply returning
-// high half (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
- SQDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16,
- i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
- SQDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16,
- i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
- SQDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32,
- i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
- SQDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32,
- i32, VPR128Lo, neon_uimm2_bare>;
-
-// Scalar Signed saturating rounding doubling multiply
-// returning high half (scalar, by element)
-def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
- 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
- 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
- let Inst{11} = Imm{2}; // h
- let Inst{21} = Imm{1}; // l
- let Inst{20} = Imm{0}; // m
- let Inst{19-16} = MRm{3-0};
-}
-def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
- 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
- let Inst{11} = 0b0; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
- 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
- let Inst{11} = Imm{1}; // h
- let Inst{21} = Imm{0}; // l
- let Inst{20-16} = MRm;
-}
-
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
- SQRDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, i32,
- VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
- SQRDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, i32,
- VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
- SQRDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, i32,
- VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
- SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32,
- VPR128Lo, neon_uimm2_bare>;
-
-// Scalar general arithmetic operation
-class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (INST FPR64:$Rn, FPR64:$Rm)>;
-
-class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
- (v1f64 FPR64:$Ra))),
- (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
-def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
-
-def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
-def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
-
-def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
-def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
-
-// Scalar Copy - DUP element to scalar
-class NeonI_Scalar_DUP<string asmop, string asmlane,
- RegisterClass ResRC, RegisterOperand VPRC,
- Operand OpImm>
- : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
- asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bits<4> Imm;
-}
-
-def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> {
- let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
- let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 0)),
- (f32 (EXTRACT_SUBREG (v4f32 VPR128:$Rn), sub_32))>;
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 1)),
- (f32 (DUPsv_S (v4f32 VPR128:$Rn), 1))>;
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 2)),
- (f32 (DUPsv_S (v4f32 VPR128:$Rn), 2))>;
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 3)),
- (f32 (DUPsv_S (v4f32 VPR128:$Rn), 3))>;
-
-def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 0)),
- (f64 (EXTRACT_SUBREG (v2f64 VPR128:$Rn), sub_64))>;
-def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 1)),
- (f64 (DUPdv_D (v2f64 VPR128:$Rn), 1))>;
-
-def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 0)),
- (f32 (EXTRACT_SUBREG (v2f32 VPR64:$Rn), sub_32))>;
-def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 1)),
- (f32 (DUPsv_S (v4f32 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- 1))>;
-
-def : Pat<(f64 (vector_extract (v1f64 VPR64:$Rn), 0)),
- (f64 (EXTRACT_SUBREG (v1f64 VPR64:$Rn), sub_64))>;
-
-multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI,
- ValueType ResTy, ValueType OpTy,Operand OpLImm,
- ValueType NOpTy, ValueType ExTy, Operand OpNImm> {
-
- def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)),
- (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>;
-
- def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)),
- (ResTy (DUPI
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- OpNImm:$Imm))>;
-}
-
-// Patterns for extract subvectors of v1ix data using scalar DUP instructions.
-defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPbv_B, v1i8, v16i8, neon_uimm4_bare,
- v8i8, v16i8, neon_uimm3_bare>;
-defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPhv_H, v1i16, v8i16, neon_uimm3_bare,
- v4i16, v8i16, neon_uimm2_bare>;
-defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPsv_S, v1i32, v4i32, neon_uimm2_bare,
- v2i32, v4i32, neon_uimm1_bare>;
-
-multiclass NeonI_Scalar_DUP_Copy_pattern1<Instruction DUPI, ValueType ResTy,
- ValueType OpTy, ValueType ElemTy,
- Operand OpImm, ValueType OpNTy,
- ValueType ExTy, Operand OpNImm> {
-
- def : Pat<(ResTy (vector_insert (ResTy undef),
- (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
- (neon_uimm0_bare:$Imm))),
- (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
-
- def : Pat<(ResTy (vector_insert (ResTy undef),
- (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
- (OpNImm:$Imm))),
- (ResTy (DUPI
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- OpNImm:$Imm))>;
-}
-
-multiclass NeonI_Scalar_DUP_Copy_pattern2<Instruction DUPI, ValueType ResTy,
- ValueType OpTy, ValueType ElemTy,
- Operand OpImm, ValueType OpNTy,
- ValueType ExTy, Operand OpNImm> {
-
- def : Pat<(ResTy (scalar_to_vector
- (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))),
- (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
-
- def : Pat<(ResTy (scalar_to_vector
- (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))),
- (ResTy (DUPI
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- OpNImm:$Imm))>;
-}
-
-// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP
-// instructions.
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
- v1i64, v2i64, i64, neon_uimm1_bare,
- v1i64, v2i64, neon_uimm0_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
- v1i32, v4i32, i32, neon_uimm2_bare,
- v2i32, v4i32, neon_uimm1_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H,
- v1i16, v8i16, i32, neon_uimm3_bare,
- v4i16, v8i16, neon_uimm2_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B,
- v1i8, v16i8, i32, neon_uimm4_bare,
- v8i8, v16i8, neon_uimm3_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
- v1i64, v2i64, i64, neon_uimm1_bare,
- v1i64, v2i64, neon_uimm0_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
- v1i32, v4i32, i32, neon_uimm2_bare,
- v2i32, v4i32, neon_uimm1_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H,
- v1i16, v8i16, i32, neon_uimm3_bare,
- v4i16, v8i16, neon_uimm2_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B,
- v1i8, v16i8, i32, neon_uimm4_bare,
- v8i8, v16i8, neon_uimm3_bare>;
-
-multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane,
- Instruction DUPI, Operand OpImm,
- RegisterClass ResRC> {
- def : NeonInstAlias<!strconcat(asmop, "$Rd, $Rn" # asmlane # "[$Imm]"),
- (DUPI ResRC:$Rd, VPR128:$Rn, OpImm:$Imm), 0b0>;
-}
-
-// Aliases for Scalar copy - DUP element (scalar)
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>;
-defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>;
-defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>;
-defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>;
-
-multiclass NeonI_SDUP<PatFrag GetLow, PatFrag GetHigh, ValueType ResTy,
- ValueType OpTy> {
- def : Pat<(ResTy (GetLow VPR128:$Rn)),
- (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>;
- def : Pat<(ResTy (GetHigh VPR128:$Rn)),
- (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>;
-}
-
-defm : NeonI_SDUP<Neon_Low16B, Neon_High16B, v8i8, v16i8>;
-defm : NeonI_SDUP<Neon_Low8H, Neon_High8H, v4i16, v8i16>;
-defm : NeonI_SDUP<Neon_Low4S, Neon_High4S, v2i32, v4i32>;
-defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>;
-defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>;
-defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>;
-
-// The following is for sext/zext from v1xx to v1xx
-multiclass NeonI_ext<string prefix, SDNode ExtOp> {
- // v1i32 -> v1i64
- def : Pat<(v1i64 (ExtOp (v1i32 FPR32:$Rn))),
- (EXTRACT_SUBREG
- (v2i64 (!cast<Instruction>(prefix # "_2S")
- (v2i32 (SUBREG_TO_REG (i64 0), $Rn, sub_32)), 0)),
- sub_64)>;
-
- // v1i16 -> v1i32
- def : Pat<(v1i32 (ExtOp (v1i16 FPR16:$Rn))),
- (EXTRACT_SUBREG
- (v4i32 (!cast<Instruction>(prefix # "_4H")
- (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)),
- sub_32)>;
-
- // v1i8 -> v1i16
- def : Pat<(v1i16 (ExtOp (v1i8 FPR8:$Rn))),
- (EXTRACT_SUBREG
- (v8i16 (!cast<Instruction>(prefix # "_8B")
- (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
- sub_16)>;
-}
-
-defm NeonI_zext : NeonI_ext<"USHLLvvi", zext>;
-defm NeonI_sext : NeonI_ext<"SSHLLvvi", sext>;
-
-// zext v1i8 -> v1i32
-def : Pat<(v1i32 (zext (v1i8 FPR8:$Rn))),
- (v1i32 (EXTRACT_SUBREG
- (v1i64 (SUBREG_TO_REG (i64 0),
- (v1i8 (DUPbv_B
- (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)),
- 0)),
- sub_8)),
- sub_32))>;
-
-// zext v1i8 -> v1i64
-def : Pat<(v1i64 (zext (v1i8 FPR8:$Rn))),
- (v1i64 (SUBREG_TO_REG (i64 0),
- (v1i8 (DUPbv_B
- (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)),
- 0)),
- sub_8))>;
-
-// zext v1i16 -> v1i64
-def : Pat<(v1i64 (zext (v1i16 FPR16:$Rn))),
- (v1i64 (SUBREG_TO_REG (i64 0),
- (v1i16 (DUPhv_H
- (v8i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)),
- 0)),
- sub_16))>;
-
-// sext v1i8 -> v1i32
-def : Pat<(v1i32 (sext (v1i8 FPR8:$Rn))),
- (EXTRACT_SUBREG
- (v4i32 (SSHLLvvi_4H
- (v4i16 (SUBREG_TO_REG (i64 0),
- (v1i16 (EXTRACT_SUBREG
- (v8i16 (SSHLLvvi_8B
- (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
- sub_16)),
- sub_16)), 0)),
- sub_32)>;
-
-// sext v1i8 -> v1i64
-def : Pat<(v1i64 (sext (v1i8 FPR8:$Rn))),
- (EXTRACT_SUBREG
- (v2i64 (SSHLLvvi_2S
- (v2i32 (SUBREG_TO_REG (i64 0),
- (v1i32 (EXTRACT_SUBREG
- (v4i32 (SSHLLvvi_4H
- (v4i16 (SUBREG_TO_REG (i64 0),
- (v1i16 (EXTRACT_SUBREG
- (v8i16 (SSHLLvvi_8B
- (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
- sub_16)),
- sub_16)), 0)),
- sub_32)),
- sub_32)), 0)),
- sub_64)>;
-
-
-// sext v1i16 -> v1i64
-def : Pat<(v1i64 (sext (v1i16 FPR16:$Rn))),
- (EXTRACT_SUBREG
- (v2i64 (SSHLLvvi_2S
- (v2i32 (SUBREG_TO_REG (i64 0),
- (v1i32 (EXTRACT_SUBREG
- (v4i32 (SSHLLvvi_4H
- (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)),
- sub_32)),
- sub_32)), 0)),
- sub_64)>;
-
-//===----------------------------------------------------------------------===//
-// Non-Instruction Patterns
-//===----------------------------------------------------------------------===//
-
-// 64-bit vector bitcasts...
-
-def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v1f64 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1f64 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1f64 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1f64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1f64 VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64 VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(v1f64 (bitconvert (v1i64 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2f32 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2i32 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v4i16 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v8i8 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64 VPR64:$src))), (v1f64 VPR64:$src)>;
-
-// ..and 128-bit vector bitcasts...
-
-def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-// ...and scalar bitcasts...
-def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>;
-def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
-
-def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>;
-
-def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>;
-
-def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
-
-def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>;
-
-def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>;
-def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-
-def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>;
-
-def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-// Scalar Three Same
-
-def neon_uimm3 : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 8;}]> {
- let ParserMatchClass = uimm3_asmoperand;
- let PrintMethod = "printUImmHexOperand";
-}
-
-def neon_uimm4 : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 16;}]> {
- let ParserMatchClass = uimm4_asmoperand;
- let PrintMethod = "printUImmHexOperand";
-}
-
-// Bitwise Extract
-class NeonI_Extract<bit q, bits<2> op2, string asmop,
- string OpS, RegisterOperand OpVPR, Operand OpImm>
- : NeonI_BitExtract<q, op2, (outs OpVPR:$Rd),
- (ins OpVPR:$Rn, OpVPR:$Rm, OpImm:$Index),
- asmop # "\t$Rd." # OpS # ", $Rn." # OpS #
- ", $Rm." # OpS # ", $Index",
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>{
- bits<4> Index;
-}
-
-def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b",
- VPR64, neon_uimm3> {
- let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}};
-}
-
-def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b",
- VPR128, neon_uimm4> {
- let Inst{14-11} = Index;
-}
-
-class NI_Extract<ValueType OpTy, RegisterOperand OpVPR, Instruction INST,
- Operand OpImm>
- : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm),
- (i64 OpImm:$Imm))),
- (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>;
-
-def : NI_Extract<v8i8, VPR64, EXTvvvi_8b, neon_uimm3>;
-def : NI_Extract<v4i16, VPR64, EXTvvvi_8b, neon_uimm3>;
-def : NI_Extract<v2i32, VPR64, EXTvvvi_8b, neon_uimm3>;
-def : NI_Extract<v1i64, VPR64, EXTvvvi_8b, neon_uimm3>;
-def : NI_Extract<v2f32, VPR64, EXTvvvi_8b, neon_uimm3>;
-def : NI_Extract<v1f64, VPR64, EXTvvvi_8b, neon_uimm3>;
-def : NI_Extract<v16i8, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v8i16, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v4i32, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>;
-
-// Table lookup
-class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
- string asmop, string OpS, RegisterOperand OpVPR,
- RegisterOperand VecList>
- : NeonI_TBL<q, op2, len, op,
- (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-// The vectors in look up table are always 16b
-multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
- def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
- !cast<RegisterOperand>(List # "16B_operand")>;
-
- def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
- !cast<RegisterOperand>(List # "16B_operand")>;
-}
-
-defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
-defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
-defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
-defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
-
-// Table lookup extension
-class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
- string asmop, string OpS, RegisterOperand OpVPR,
- RegisterOperand VecList>
- : NeonI_TBL<q, op2, len, op,
- (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
- asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
-}
-
-// The vectors in look up table are always 16b
-multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> {
- def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
- !cast<RegisterOperand>(List # "16B_operand")>;
-
- def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
- !cast<RegisterOperand>(List # "16B_operand")>;
-}
-
-defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
-defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
-defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
-defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
-
-class NeonI_INS_main<string asmop, string Res, ValueType ResTy,
- RegisterClass OpGPR, ValueType OpTy, Operand OpImm>
- : NeonI_copy<0b1, 0b0, 0b0011,
- (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm),
- asmop # "\t$Rd." # Res # "[$Imm], $Rn",
- [(set (ResTy VPR128:$Rd),
- (ResTy (vector_insert
- (ResTy VPR128:$src),
- (OpTy OpGPR:$Rn),
- (OpImm:$Imm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- bits<4> Imm;
- let Constraints = "$src = $Rd";
-}
-
-//Insert element (vector, from main)
-def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32,
- neon_uimm4_bare> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32,
- neon_uimm3_bare> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32,
- neon_uimm2_bare> {
- let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64,
- neon_uimm1_bare> {
- let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn",
- (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn",
- (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn",
- (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn",
- (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>;
-
-class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy,
- RegisterClass OpGPR, ValueType OpTy,
- Operand OpImm, Instruction INS>
- : Pat<(ResTy (vector_insert
- (ResTy VPR64:$src),
- (OpTy OpGPR:$Rn),
- (OpImm:$Imm))),
- (ResTy (EXTRACT_SUBREG
- (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
- OpGPR:$Rn, OpImm:$Imm)), sub_64))>;
-
-def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32,
- neon_uimm3_bare, INSbw>;
-def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32,
- neon_uimm2_bare, INShw>;
-def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32,
- neon_uimm1_bare, INSsw>;
-def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64,
- neon_uimm0_bare, INSdx>;
-
-class NeonI_INS_element<string asmop, string Res, Operand ResImm>
- : NeonI_insert<0b1, 0b1,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn,
- ResImm:$Immd, ResImm:$Immn),
- asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]",
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- bits<4> Immd;
- bits<4> Immn;
-}
-
-//Insert element (vector, from element)
-def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> {
- let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1};
- let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}};
-}
-def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> {
- let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0};
- let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0};
- // bit 11 is unspecified, but should be set to zero.
-}
-def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> {
- let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0};
- let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0};
- // bits 11-12 are unspecified, but should be set to zero.
-}
-def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> {
- let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0};
- let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0};
- // bits 11-13 are unspecified, but should be set to zero.
-}
-
-def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]",
- (INSELb VPR128:$Rd, VPR128:$Rn,
- neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>;
-def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]",
- (INSELh VPR128:$Rd, VPR128:$Rn,
- neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>;
-def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]",
- (INSELs VPR128:$Rd, VPR128:$Rn,
- neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>;
-def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]",
- (INSELd VPR128:$Rd, VPR128:$Rn,
- neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>;
-
-multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy,
- ValueType MidTy, Operand StImm, Operand NaImm,
- Instruction INS> {
-def : Pat<(ResTy (vector_insert
- (ResTy VPR128:$src),
- (MidTy (vector_extract
- (ResTy VPR128:$Rn),
- (StImm:$Immn))),
- (StImm:$Immd))),
- (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
- StImm:$Immd, StImm:$Immn)>;
-
-def : Pat <(ResTy (vector_insert
- (ResTy VPR128:$src),
- (MidTy (vector_extract
- (NaTy VPR64:$Rn),
- (NaImm:$Immn))),
- (StImm:$Immd))),
- (INS (ResTy VPR128:$src),
- (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
- StImm:$Immd, NaImm:$Immn)>;
-
-def : Pat <(NaTy (vector_insert
- (NaTy VPR64:$src),
- (MidTy (vector_extract
- (ResTy VPR128:$Rn),
- (StImm:$Immn))),
- (NaImm:$Immd))),
- (NaTy (EXTRACT_SUBREG
- (ResTy (INS
- (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
- (ResTy VPR128:$Rn),
- NaImm:$Immd, StImm:$Immn)),
- sub_64))>;
-
-def : Pat <(NaTy (vector_insert
- (NaTy VPR64:$src),
- (MidTy (vector_extract
- (NaTy VPR64:$Rn),
- (NaImm:$Immn))),
- (NaImm:$Immd))),
- (NaTy (EXTRACT_SUBREG
- (ResTy (INS
- (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
- (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
- NaImm:$Immd, NaImm:$Immn)),
- sub_64))>;
-}
-
-defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare,
- neon_uimm1_bare, INSELs>;
-defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare,
- neon_uimm0_bare, INSELd>;
-defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
- neon_uimm3_bare, INSELb>;
-defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
- neon_uimm2_bare, INSELh>;
-defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
- neon_uimm1_bare, INSELs>;
-defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare,
- neon_uimm0_bare, INSELd>;
-
-multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
- ValueType MidTy,
- RegisterClass OpFPR, Operand ResImm,
- SubRegIndex SubIndex, Instruction INS> {
-def : Pat <(ResTy (vector_insert
- (ResTy VPR128:$src),
- (MidTy OpFPR:$Rn),
- (ResImm:$Imm))),
- (INS (ResTy VPR128:$src),
- (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
- ResImm:$Imm,
- (i64 0))>;
-
-def : Pat <(NaTy (vector_insert
- (NaTy VPR64:$src),
- (MidTy OpFPR:$Rn),
- (ResImm:$Imm))),
- (NaTy (EXTRACT_SUBREG
- (ResTy (INS
- (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
- (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
- ResImm:$Imm,
- (i64 0))),
- sub_64))>;
-}
-
-defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
- sub_32, INSELs>;
-defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
- sub_64, INSELd>;
-
-class NeonI_SMOV<string asmop, string Res, bit Q,
- ValueType OpTy, ValueType eleTy,
- Operand OpImm, RegisterClass ResGPR, ValueType ResTy>
- : NeonI_copy<Q, 0b0, 0b0101,
- (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
- asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
- [(set (ResTy ResGPR:$Rd),
- (ResTy (sext_inreg
- (ResTy (vector_extract
- (OpTy VPR128:$Rn), (OpImm:$Imm))),
- eleTy)))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bits<4> Imm;
-}
-
-//Signed integer move (main, from element)
-def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare,
- GPR32, i32> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare,
- GPR32, i32> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare,
- GPR64, i64> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare,
- GPR64, i64> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare,
- GPR64, i64> {
- let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-
-multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy,
- ValueType eleTy, Operand StImm, Operand NaImm,
- Instruction SMOVI> {
- def : Pat<(i64 (sext_inreg
- (i64 (anyext
- (i32 (vector_extract
- (StTy VPR128:$Rn), (StImm:$Imm))))),
- eleTy)),
- (SMOVI VPR128:$Rn, StImm:$Imm)>;
-
- def : Pat<(i64 (sext
- (i32 (vector_extract
- (StTy VPR128:$Rn), (StImm:$Imm))))),
- (SMOVI VPR128:$Rn, StImm:$Imm)>;
-
- def : Pat<(i64 (sext_inreg
- (i64 (vector_extract
- (NaTy VPR64:$Rn), (NaImm:$Imm))),
- eleTy)),
- (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- NaImm:$Imm)>;
-
- def : Pat<(i64 (sext_inreg
- (i64 (anyext
- (i32 (vector_extract
- (NaTy VPR64:$Rn), (NaImm:$Imm))))),
- eleTy)),
- (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- NaImm:$Imm)>;
-
- def : Pat<(i64 (sext
- (i32 (vector_extract
- (NaTy VPR64:$Rn), (NaImm:$Imm))))),
- (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- NaImm:$Imm)>;
-}
-
-defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
- neon_uimm3_bare, SMOVxb>;
-defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
- neon_uimm2_bare, SMOVxh>;
-defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
- neon_uimm1_bare, SMOVxs>;
-
-class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
- ValueType eleTy, Operand StImm, Operand NaImm,
- Instruction SMOVI>
- : Pat<(i32 (sext_inreg
- (i32 (vector_extract
- (NaTy VPR64:$Rn), (NaImm:$Imm))),
- eleTy)),
- (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- NaImm:$Imm)>;
-
-def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
- neon_uimm3_bare, SMOVwb>;
-def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
- neon_uimm2_bare, SMOVwh>;
-
-class NeonI_UMOV<string asmop, string Res, bit Q,
- ValueType OpTy, Operand OpImm,
- RegisterClass ResGPR, ValueType ResTy>
- : NeonI_copy<Q, 0b0, 0b0111,
- (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
- asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
- [(set (ResTy ResGPR:$Rd),
- (ResTy (vector_extract
- (OpTy VPR128:$Rn), (OpImm:$Imm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bits<4> Imm;
-}
-
-//Unsigned integer move (main, from element)
-def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare,
- GPR32, i32> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare,
- GPR32, i32> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare,
- GPR32, i32> {
- let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare,
- GPR64, i64> {
- let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]",
- (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]",
- (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>;
-
-class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy,
- Operand StImm, Operand NaImm,
- Instruction SMOVI>
- : Pat<(ResTy (vector_extract
- (NaTy VPR64:$Rn), NaImm:$Imm)),
- (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- NaImm:$Imm)>;
-
-def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
- neon_uimm3_bare, UMOVwb>;
-def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
- neon_uimm2_bare, UMOVwh>;
-def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
- neon_uimm1_bare, UMOVws>;
-
-def : Pat<(i32 (and
- (i32 (vector_extract
- (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))),
- 255)),
- (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>;
-
-def : Pat<(i32 (and
- (i32 (vector_extract
- (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))),
- 65535)),
- (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>;
-
-def : Pat<(i64 (zext
- (i32 (vector_extract
- (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))),
- (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>;
-
-def : Pat<(i32 (and
- (i32 (vector_extract
- (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))),
- 255)),
- (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
- neon_uimm3_bare:$Imm)>;
-
-def : Pat<(i32 (and
- (i32 (vector_extract
- (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))),
- 65535)),
- (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
- neon_uimm2_bare:$Imm)>;
-
-def : Pat<(i64 (zext
- (i32 (vector_extract
- (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))),
- (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
- neon_uimm0_bare:$Imm)>;
-
-// Additional copy patterns for scalar types
-def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))),
- (UMOVwb (v16i8
- (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>;
-
-def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))),
- (UMOVwh (v8i16
- (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>;
-
-def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))),
- (FMOVws FPR32:$Rn)>;
-
-def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))),
- (FMOVxd FPR64:$Rn)>;
-
-def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))),
- (f64 FPR64:$Rn)>;
-
-def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
- (v1i8 (EXTRACT_SUBREG (v16i8
- (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
- sub_8))>;
-
-def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)),
- (v1i16 (EXTRACT_SUBREG (v8i16
- (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
- sub_16))>;
-
-def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
- (FMOVsw $src)>;
-
-def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
- (FMOVdx $src)>;
-
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
- (v8i8 (EXTRACT_SUBREG (v16i8
- (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
- sub_64))>;
-
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
- (v4i16 (EXTRACT_SUBREG (v8i16
- (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
- sub_64))>;
-
-def : Pat<(v2i32 (scalar_to_vector GPR32:$Rn)),
- (v2i32 (EXTRACT_SUBREG (v16i8
- (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))),
- sub_64))>;
-
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
- (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
- (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v4i32 (scalar_to_vector GPR32:$Rn)),
- (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v2i64 (scalar_to_vector GPR64:$Rn)),
- (INSdx (v2i64 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
- (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>;
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
- (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>;
-
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
- (v1f64 FPR64:$Rn)>;
-
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))),
- (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
- (f64 FPR64:$src), sub_64)>;
-
-class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane,
- RegisterOperand ResVPR, Operand OpImm>
- : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
- (ins VPR128:$Rn, OpImm:$Imm),
- asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- bits<4> Imm;
-}
-
-def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128,
- neon_uimm4_bare> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-
-def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128,
- neon_uimm3_bare> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-
-def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128,
- neon_uimm2_bare> {
- let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-
-def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128,
- neon_uimm1_bare> {
- let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64,
- neon_uimm4_bare> {
- let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-
-def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64,
- neon_uimm3_bare> {
- let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-
-def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64,
- neon_uimm2_bare> {
- let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-
-multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
- ValueType OpTy,ValueType NaTy,
- ValueType ExTy, Operand OpLImm,
- Operand OpNImm> {
-def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
- (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
-
-def : Pat<(ResTy (Neon_vduplane
- (NaTy VPR64:$Rn), OpNImm:$Imm)),
- (ResTy (DUPELT
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
-}
-defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8,
- neon_uimm4_bare, neon_uimm3_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8,
- neon_uimm4_bare, neon_uimm3_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16,
- neon_uimm3_bare, neon_uimm2_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16,
- neon_uimm3_bare, neon_uimm2_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32,
- neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32,
- neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64,
- neon_uimm1_bare, neon_uimm0_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32,
- neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32,
- neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64,
- neon_uimm1_bare, neon_uimm0_bare>;
-
-def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
- (v2f32 (DUPELT2s
- (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (i64 0)))>;
-def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
- (v4f32 (DUPELT4s
- (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (i64 0)))>;
-def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
- (v2f64 (DUPELT2d
- (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
- (i64 0)))>;
-
-multiclass NeonI_DUP_pattern<Instruction DUPELT, ValueType ResTy,
- ValueType OpTy, RegisterClass OpRC,
- Operand OpNImm, SubRegIndex SubIndex> {
-def : Pat<(ResTy (Neon_vduplane (OpTy OpRC:$Rn), OpNImm:$Imm)),
- (ResTy (DUPELT
- (SUBREG_TO_REG (i64 0), OpRC:$Rn, SubIndex), OpNImm:$Imm))>;
-}
-
-defm : NeonI_DUP_pattern<DUPELT4h, v4i16, v1i16, FPR16, neon_uimm2_bare,sub_16>;
-defm : NeonI_DUP_pattern<DUPELT4s, v4i32, v1i32, FPR32, neon_uimm2_bare,sub_32>;
-defm : NeonI_DUP_pattern<DUPELT8b, v8i8, v1i8, FPR8, neon_uimm3_bare, sub_8>;
-defm : NeonI_DUP_pattern<DUPELT8h, v8i16, v1i16, FPR16, neon_uimm3_bare,sub_16>;
-defm : NeonI_DUP_pattern<DUPELT16b, v16i8, v1i8, FPR8, neon_uimm4_bare, sub_8>;
-
-class NeonI_DUP<bit Q, string asmop, string rdlane,
- RegisterOperand ResVPR, ValueType ResTy,
- RegisterClass OpGPR, ValueType OpTy>
- : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
- asmop # "\t$Rd" # rdlane # ", $Rn",
- [(set (ResTy ResVPR:$Rd),
- (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
- let Inst{20-16} = 0b00001;
- // bits 17-20 are unspecified, but should be set to zero.
-}
-
-def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
- let Inst{20-16} = 0b00010;
- // bits 18-20 are unspecified, but should be set to zero.
-}
-
-def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
- let Inst{20-16} = 0b00100;
- // bits 19-20 are unspecified, but should be set to zero.
-}
-
-def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
- let Inst{20-16} = 0b01000;
- // bit 20 is unspecified, but should be set to zero.
-}
-
-def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
- let Inst{20-16} = 0b00001;
- // bits 17-20 are unspecified, but should be set to zero.
-}
-
-def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
- let Inst{20-16} = 0b00010;
- // bits 18-20 are unspecified, but should be set to zero.
-}
-
-def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
- let Inst{20-16} = 0b00100;
- // bits 19-20 are unspecified, but should be set to zero.
-}
-
-// patterns for CONCAT_VECTORS
-multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
-def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
- (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
-def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
- (INSELd
- (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
- (i64 1),
- (i64 0))>;
-def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
- (DUPELT2d
- (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- (i64 0))> ;
-}
-
-defm : Concat_Vector_Pattern<v16i8, v8i8>;
-defm : Concat_Vector_Pattern<v8i16, v4i16>;
-defm : Concat_Vector_Pattern<v4i32, v2i32>;
-defm : Concat_Vector_Pattern<v2i64, v1i64>;
-defm : Concat_Vector_Pattern<v4f32, v2f32>;
-defm : Concat_Vector_Pattern<v2f64, v1f64>;
-
-def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), undef)),
- (v2i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32))>;
-def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (EXTRACT_SUBREG
- (v4i32 (INSELs
- (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)),
- (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
- (i64 1),
- (i64 0))),
- sub_64)>;
-def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rn))),
- (DUPELT2s (v4i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32)), 0)>;
-
-//patterns for EXTRACT_SUBVECTOR
-def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
- (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
- (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
- (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
- (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
- (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
- (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-
-// The followings are for instruction class (3V Elem)
-
-// Variant 1
-
-class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS, string EleOpS,
- Operand OpImm, RegisterOperand ResVPR,
- RegisterOperand OpVPR, RegisterOperand EleOpVPR>
- : NeonI_2VElem<q, u, size, opcode,
- (outs ResVPR:$Rd), (ins ResVPR:$src, OpVPR:$Rn,
- EleOpVPR:$Re, OpImm:$Index),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
- ", $Re." # EleOpS # "[$Index]",
- [],
- NoItinerary>,
- Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
- bits<3> Index;
- bits<5> Re;
-
- let Constraints = "$src = $Rd";
-}
-
-multiclass NI_2VE_v1<bit u, bits<4> opcode, string asmop> {
- // vector register class for element is always 128-bit to cover the max index
- def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
- neon_uimm2_bare, VPR64, VPR64, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
- neon_uimm2_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- // Index operations on 16-bit(H) elements are restricted to using v0-v15.
- def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
- neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-
- def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
- neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-}
-
-defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">;
-defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">;
-
-// Pattern for lane in 128-bit vector
-class NI_2VE_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand ResVPR, RegisterOperand OpVPR,
- RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
- ValueType EleOpTy>
- : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
- (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VE_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand ResVPR, RegisterOperand OpVPR,
- RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
- ValueType EleOpTy>
- : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
- (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST ResVPR:$src, OpVPR:$Rn,
- (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-multiclass NI_2VE_v1_pat<string subop, SDPatternOperator op>
-{
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
- op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>;
-
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
- op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>;
-
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
- op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
-
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
- op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VE_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
- op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>;
-
- def : NI_2VE_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
- op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
-}
-
-defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>;
-defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>;
-
-class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode,
- string asmop, string ResS, string OpS, string EleOpS,
- Operand OpImm, RegisterOperand ResVPR,
- RegisterOperand OpVPR, RegisterOperand EleOpVPR>
- : NeonI_2VElem<q, u, size, opcode,
- (outs ResVPR:$Rd), (ins OpVPR:$Rn,
- EleOpVPR:$Re, OpImm:$Index),
- asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
- ", $Re." # EleOpS # "[$Index]",
- [],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- bits<3> Index;
- bits<5> Re;
-}
-
-multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> {
- // vector register class for element is always 128-bit to cover the max index
- def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
- neon_uimm2_bare, VPR64, VPR64, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
- neon_uimm2_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- // Index operations on 16-bit(H) elements are restricted to using v0-v15.
- def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
- neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-
- def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
- neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">;
-defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">;
-defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">;
-}
-
-// Pattern for lane in 128-bit vector
-class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand OpVPR, RegisterOperand EleOpVPR,
- ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
- : Pat<(ResTy (op (OpTy OpVPR:$Rn),
- (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VE_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand OpVPR, RegisterOperand EleOpVPR,
- ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
- : Pat<(ResTy (op (OpTy OpVPR:$Rn),
- (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST OpVPR:$Rn,
- (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-multiclass NI_2VE_mul_v1_pat<string subop, SDPatternOperator op> {
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
- op, VPR64, VPR128, v2i32, v2i32, v4i32>;
-
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
- op, VPR128, VPR128, v4i32, v4i32, v4i32>;
-
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
- op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
-
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
- op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
- op, VPR64, VPR64, v2i32, v2i32, v2i32>;
-
- def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
- op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
-}
-
-defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>;
-defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>;
-defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>;
-
-// Variant 2
-
-multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> {
- // vector register class for element is always 128-bit to cover the max index
- def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
- neon_uimm2_bare, VPR64, VPR64, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
- neon_uimm2_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- // _1d2d doesn't exist!
-
- def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
- neon_uimm1_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{0}};
- let Inst{21} = 0b0;
- let Inst{20-16} = Re;
- }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">;
-defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">;
-}
-
-class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand OpVPR, RegisterOperand EleOpVPR,
- ValueType ResTy, ValueType OpTy, ValueType EleOpTy,
- SDPatternOperator coreop>
- : Pat<(ResTy (op (OpTy OpVPR:$Rn),
- (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))),
- (INST OpVPR:$Rn,
- (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>;
-
-multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> {
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
- op, VPR64, VPR128, v2f32, v2f32, v4f32>;
-
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
- op, VPR128, VPR128, v4f32, v4f32, v4f32>;
-
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
- op, VPR128, VPR128, v2f64, v2f64, v2f64>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
- op, VPR64, VPR64, v2f32, v2f32, v2f32>;
-
- def : NI_2VE_mul_lane_2d<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
- op, VPR128, VPR64, v2f64, v2f64, v1f64,
- BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
-}
-
-defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>;
-defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>;
-
-def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))),
- (v2f32 VPR64:$Rn))),
- (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))),
- (v4f32 VPR128:$Rn))),
- (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))),
- (v2f64 VPR128:$Rn))),
- (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>;
-
-// The followings are patterns using fma
-// -ffp-contract=fast generates fma
-
-multiclass NI_2VE_v2<bit u, bits<4> opcode, string asmop> {
- // vector register class for element is always 128-bit to cover the max index
- def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
- neon_uimm2_bare, VPR64, VPR64, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
- neon_uimm2_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- // _1d2d doesn't exist!
-
- def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
- neon_uimm1_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{0}};
- let Inst{21} = 0b0;
- let Inst{20-16} = Re;
- }
-}
-
-defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">;
-defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">;
-
-// Pattern for lane in 128-bit vector
-class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand ResVPR, RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy,
- SDPatternOperator coreop>
- : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
- (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
- (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane 0
-class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op,
- RegisterOperand ResVPR, ValueType ResTy>
- : Pat<(ResTy (op (ResTy ResVPR:$Rn),
- (ResTy (Neon_vdup (f32 FPR32:$Re))),
- (ResTy ResVPR:$src))),
- (INST ResVPR:$src, ResVPR:$Rn,
- (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand ResVPR, RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy,
- SDPatternOperator coreop>
- : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
- (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
- (INST ResVPR:$src, ResVPR:$Rn,
- (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEswap_lane_2d2d<Instruction INST, Operand OpImm,
- SDPatternOperator op,
- RegisterOperand ResVPR, RegisterOperand OpVPR,
- ValueType ResTy, ValueType OpTy,
- SDPatternOperator coreop>
- : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))),
- (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
- (INST ResVPR:$src, ResVPR:$Rn,
- (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>;
-
-
-multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> {
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
- neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
- BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
- def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"),
- op, VPR64, v2f32>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
- neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
- BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
- def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"),
- op, VPR128, v4f32>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
- neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
- BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
- neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
- BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
- def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
- neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
- BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
-}
-
-defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>;
-
-// Pattern for lane 0
-class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op,
- RegisterOperand ResVPR, ValueType ResTy>
- : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)),
- (ResTy (Neon_vdup (f32 FPR32:$Re))),
- (ResTy ResVPR:$src))),
- (INST ResVPR:$src, ResVPR:$Rn,
- (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op>
-{
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
- neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
- BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
- neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
- BinOpFrag<(Neon_vduplane
- (fneg node:$LHS), node:$RHS)>>;
-
- def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"),
- op, VPR64, v2f32>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
- neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
- BinOpFrag<(fneg (Neon_vduplane
- node:$LHS, node:$RHS))>>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
- neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
- BinOpFrag<(Neon_vduplane
- (fneg node:$LHS), node:$RHS)>>;
-
- def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"),
- op, VPR128, v4f32>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
- neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
- BinOpFrag<(fneg (Neon_vduplane
- node:$LHS, node:$RHS))>>;
-
- def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
- neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
- BinOpFrag<(Neon_vduplane
- (fneg node:$LHS), node:$RHS)>>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
- neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
- BinOpFrag<(fneg (Neon_vduplane
- node:$LHS, node:$RHS))>>;
-
- def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
- neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
- BinOpFrag<(Neon_vduplane
- (fneg node:$LHS), node:$RHS)>>;
-
- def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
- neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
- BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
-
- def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
- neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
- BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>;
-
- def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
- neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
- BinOpFrag<(fneg (Neon_combine_2d
- node:$LHS, node:$RHS))>>;
-
- def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
- neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
- BinOpFrag<(Neon_combine_2d
- (fneg node:$LHS), (fneg node:$RHS))>>;
-}
-
-defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>;
-
-// Variant 3: Long type
-// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S
-// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S
-
-multiclass NI_2VE_v3<bit u, bits<4> opcode, string asmop> {
- // vector register class for element is always 128-bit to cover the max index
- def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
- neon_uimm2_bare, VPR128, VPR64, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
- neon_uimm2_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- // Index operations on 16-bit(H) elements are restricted to using v0-v15.
- def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
- neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-
- def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
- neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-}
-
-defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">;
-defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">;
-defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">;
-defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">;
-defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">;
-defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">;
-
-multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> {
- // vector register class for element is always 128-bit to cover the max index
- def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
- neon_uimm2_bare, VPR128, VPR64, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
- neon_uimm2_bare, VPR128, VPR128, VPR128> {
- let Inst{11} = {Index{1}};
- let Inst{21} = {Index{0}};
- let Inst{20-16} = Re;
- }
-
- // Index operations on 16-bit(H) elements are restricted to using v0-v15.
- def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
- neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-
- def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
- neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
- let Inst{11} = {Index{2}};
- let Inst{21} = {Index{1}};
- let Inst{20} = {Index{0}};
- let Inst{19-16} = Re{3-0};
- }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">;
-defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">;
-defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">;
-}
-
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
- (FMOVdd $src)>;
-
-// Pattern for lane in 128-bit vector
-class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand EleOpVPR, ValueType ResTy,
- ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
- SDPatternOperator hiop>
- : Pat<(ResTy (op (ResTy VPR128:$src),
- (HalfOpTy (hiop (OpTy VPR128:$Rn))),
- (HalfOpTy (Neon_vduplane
- (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand EleOpVPR, ValueType ResTy,
- ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
- SDPatternOperator hiop>
- : Pat<(ResTy (op (ResTy VPR128:$src),
- (HalfOpTy (hiop (OpTy VPR128:$Rn))),
- (HalfOpTy (Neon_vduplane
- (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST VPR128:$src, VPR128:$Rn,
- (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op,
- ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
- SDPatternOperator hiop, Instruction DupInst>
- : Pat<(ResTy (op (ResTy VPR128:$src),
- (HalfOpTy (hiop (OpTy VPR128:$Rn))),
- (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
- (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>;
-
-multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> {
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
- op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
-
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
- op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>;
-
- def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
- op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
-
- def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
- op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
-
- def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
- op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
-
- def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
- op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
- op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
-
- def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
- op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>;
-
- def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
- op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
-
- def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
- op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
-}
-
-defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>;
-defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>;
-defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>;
-defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>;
-
-// Pattern for lane in 128-bit vector
-class NI_2VEL2_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand EleOpVPR, ValueType ResTy,
- ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
- SDPatternOperator hiop>
- : Pat<(ResTy (op
- (HalfOpTy (hiop (OpTy VPR128:$Rn))),
- (HalfOpTy (Neon_vduplane
- (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
- RegisterOperand EleOpVPR, ValueType ResTy,
- ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
- SDPatternOperator hiop>
- : Pat<(ResTy (op
- (HalfOpTy (hiop (OpTy VPR128:$Rn))),
- (HalfOpTy (Neon_vduplane
- (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
- (INST VPR128:$Rn,
- (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-// Pattern for fixed lane 0
-class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op,
- ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
- SDPatternOperator hiop, Instruction DupInst>
- : Pat<(ResTy (op
- (HalfOpTy (hiop (OpTy VPR128:$Rn))),
- (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
- (INST VPR128:$Rn, (DupInst $Re), 0)>;
-
-multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> {
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
- op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
-
- def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
- op, VPR64, VPR128, v2i64, v2i32, v4i32>;
-
- def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
- op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
-
- def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
- op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
-
- def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_4s8h"),
- op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
-
- def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"),
- op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
- op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
-
- def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
- op, VPR64, VPR64, v2i64, v2i32, v2i32>;
-
- def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
- op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
-
- def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
- op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
-}
-
-defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>;
-defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>;
-defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>;
-
-multiclass NI_qdma<SDPatternOperator op> {
- def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (op node:$Ra,
- (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
-
- def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (op node:$Ra,
- (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
-}
-
-defm Neon_qdmlal : NI_qdma<int_arm_neon_vqadds>;
-defm Neon_qdmlsl : NI_qdma<int_arm_neon_vqsubs>;
-
-multiclass NI_2VEL_v3_qdma_pat<string subop, string op> {
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
- !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR128Lo,
- v4i32, v4i16, v8i16>;
-
- def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
- !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR128,
- v2i64, v2i32, v4i32>;
-
- def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
- !cast<PatFrag>(op # "_4s"), VPR128Lo,
- v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
-
- def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
- !cast<PatFrag>(op # "_2d"), VPR128,
- v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
-
- def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
- !cast<PatFrag>(op # "_4s"),
- v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
-
- def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
- !cast<PatFrag>(op # "_2d"),
- v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
-
- // Index can only be half of the max value for lane in 64-bit vector
-
- def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
- !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR64Lo,
- v4i32, v4i16, v4i16>;
-
- def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
- !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR64,
- v2i64, v2i32, v2i32>;
-
- def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
- !cast<PatFrag>(op # "_4s"), VPR64Lo,
- v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
-
- def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
- !cast<PatFrag>(op # "_2d"), VPR64,
- v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
-}
-
-defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">;
-defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">;
-
-// End of implementation for instruction class (3V Elem)
-
-class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U,
- bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy,
- SDPatternOperator Neon_Rev>
- : NeonI_2VMisc<Q, U, size, opcode,
- (outs ResVPR:$Rd), (ins ResVPR:$Rn),
- asmop # "\t$Rd." # Res # ", $Rn." # Res,
- [(set (ResTy ResVPR:$Rd),
- (ResTy (Neon_Rev (ResTy ResVPR:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128,
- v16i8, Neon_rev64>;
-def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128,
- v8i16, Neon_rev64>;
-def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128,
- v4i32, Neon_rev64>;
-def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64,
- v8i8, Neon_rev64>;
-def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64,
- v4i16, Neon_rev64>;
-def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64,
- v2i32, Neon_rev64>;
-
-def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>;
-def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>;
-
-def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128,
- v16i8, Neon_rev32>;
-def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128,
- v8i16, Neon_rev32>;
-def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64,
- v8i8, Neon_rev32>;
-def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64,
- v4i16, Neon_rev32>;
-
-def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128,
- v16i8, Neon_rev16>;
-def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64,
- v8i8, Neon_rev16>;
-
-multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode,
- SDPatternOperator Neon_Padd> {
- def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.8h, $Rn.16b",
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.4h, $Rn.8b",
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.8h",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.4h",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2d, $Rn.4s",
- [(set (v2i64 VPR128:$Rd),
- (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.1d, $Rn.2s",
- [(set (v1i64 VPR64:$Rd),
- (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
- int_arm_neon_vpaddls>;
-defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010,
- int_arm_neon_vpaddlu>;
-
-def : Pat<(v1i64 (int_aarch64_neon_saddlv (v2i32 VPR64:$Rn))),
- (SADDLP2s1d $Rn)>;
-def : Pat<(v1i64 (int_aarch64_neon_uaddlv (v2i32 VPR64:$Rn))),
- (UADDLP2s1d $Rn)>;
-
-multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
- SDPatternOperator Neon_Padd> {
- let Constraints = "$src = $Rd" in {
- def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.8h, $Rn.16b",
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (Neon_Padd
- (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
- asmop # "\t$Rd.4h, $Rn.8b",
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (Neon_Padd
- (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.8h",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_Padd
- (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.4h",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_Padd
- (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.2d, $Rn.4s",
- [(set (v2i64 VPR128:$Rd),
- (v2i64 (Neon_Padd
- (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
- asmop # "\t$Rd.1d, $Rn.2s",
- [(set (v1i64 VPR64:$Rd),
- (v1i64 (Neon_Padd
- (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110,
- int_arm_neon_vpadals>;
-defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110,
- int_arm_neon_vpadalu>;
-
-multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> {
- def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.16b, $Rn.16b",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.8h, $Rn.8h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2d, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.8b, $Rn.8b",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.4h, $Rn.4h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.2s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>;
-defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>;
-defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>;
-defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>;
-
-multiclass NeonI_2VMisc_BHSD_1Arg_Pattern<string Prefix,
- SDPatternOperator Neon_Op> {
- def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))),
- (v16i8 (!cast<Instruction>(Prefix # 16b) (v16i8 VPR128:$Rn)))>;
-
- def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))),
- (v8i16 (!cast<Instruction>(Prefix # 8h) (v8i16 VPR128:$Rn)))>;
-
- def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))),
- (v4i32 (!cast<Instruction>(Prefix # 4s) (v4i32 VPR128:$Rn)))>;
-
- def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))),
- (v2i64 (!cast<Instruction>(Prefix # 2d) (v2i64 VPR128:$Rn)))>;
-
- def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))),
- (v8i8 (!cast<Instruction>(Prefix # 8b) (v8i8 VPR64:$Rn)))>;
-
- def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))),
- (v4i16 (!cast<Instruction>(Prefix # 4h) (v4i16 VPR64:$Rn)))>;
-
- def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))),
- (v2i32 (!cast<Instruction>(Prefix # 2s) (v2i32 VPR64:$Rn)))>;
-}
-
-defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>;
-defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>;
-defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>;
-
-def : Pat<(v16i8 (sub
- (v16i8 Neon_AllZero),
- (v16i8 VPR128:$Rn))),
- (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (sub
- (v8i8 Neon_AllZero),
- (v8i8 VPR64:$Rn))),
- (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>;
-def : Pat<(v8i16 (sub
- (v8i16 (bitconvert (v16i8 Neon_AllZero))),
- (v8i16 VPR128:$Rn))),
- (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>;
-def : Pat<(v4i16 (sub
- (v4i16 (bitconvert (v8i8 Neon_AllZero))),
- (v4i16 VPR64:$Rn))),
- (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>;
-def : Pat<(v4i32 (sub
- (v4i32 (bitconvert (v16i8 Neon_AllZero))),
- (v4i32 VPR128:$Rn))),
- (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>;
-def : Pat<(v2i32 (sub
- (v2i32 (bitconvert (v8i8 Neon_AllZero))),
- (v2i32 VPR64:$Rn))),
- (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>;
-def : Pat<(v2i64 (sub
- (v2i64 (bitconvert (v16i8 Neon_AllZero))),
- (v2i64 VPR128:$Rn))),
- (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>;
-
-multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> {
- let Constraints = "$src = $Rd" in {
- def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.16b, $Rn.16b",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.8h, $Rn.8h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.2d, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
- asmop # "\t$Rd.8b, $Rn.8b",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
- asmop # "\t$Rd.4h, $Rn.4h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.2s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>;
-defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>;
-
-multiclass NeonI_2VMisc_BHSD_2Args_Pattern<string Prefix,
- SDPatternOperator Neon_Op> {
- def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))),
- (v16i8 (!cast<Instruction>(Prefix # 16b)
- (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>;
-
- def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))),
- (v8i16 (!cast<Instruction>(Prefix # 8h)
- (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>;
-
- def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))),
- (v4i32 (!cast<Instruction>(Prefix # 4s)
- (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>;
-
- def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))),
- (v2i64 (!cast<Instruction>(Prefix # 2d)
- (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>;
-
- def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))),
- (v8i8 (!cast<Instruction>(Prefix # 8b)
- (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>;
-
- def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))),
- (v4i16 (!cast<Instruction>(Prefix # 4h)
- (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>;
-
- def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))),
- (v2i32 (!cast<Instruction>(Prefix # 2s)
- (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>;
-}
-
-defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>;
-defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>;
-
-multiclass NeonI_2VMisc_BHSsizes<string asmop, bit U,
- SDPatternOperator Neon_Op> {
- def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.16b, $Rn.16b",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.8h, $Rn.8h",
- [(set (v8i16 VPR128:$Rd),
- (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.8b, $Rn.8b",
- [(set (v8i8 VPR64:$Rd),
- (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.4h, $Rn.4h",
- [(set (v4i16 VPR64:$Rd),
- (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.2s",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>;
-defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>;
-
-multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size,
- bits<5> Opcode> {
- def 16b : NeonI_2VMisc<0b1, U, size, Opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.16b, $Rn.16b",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8b : NeonI_2VMisc<0b0, U, size, Opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.8b, $Rn.8b",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>;
-defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>;
-defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>;
-
-def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b",
- (NOT16b VPR128:$Rd, VPR128:$Rn), 0>;
-def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b",
- (NOT8b VPR64:$Rd, VPR64:$Rn), 0>;
-
-def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))),
- (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))),
- (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>;
-
-def : Pat<(v16i8 (xor
- (v16i8 VPR128:$Rn),
- (v16i8 Neon_AllOne))),
- (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (xor
- (v8i8 VPR64:$Rn),
- (v8i8 Neon_AllOne))),
- (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>;
-def : Pat<(v8i16 (xor
- (v8i16 VPR128:$Rn),
- (v8i16 (bitconvert (v16i8 Neon_AllOne))))),
- (NOT16b VPR128:$Rn)>;
-def : Pat<(v4i16 (xor
- (v4i16 VPR64:$Rn),
- (v4i16 (bitconvert (v8i8 Neon_AllOne))))),
- (NOT8b VPR64:$Rn)>;
-def : Pat<(v4i32 (xor
- (v4i32 VPR128:$Rn),
- (v4i32 (bitconvert (v16i8 Neon_AllOne))))),
- (NOT16b VPR128:$Rn)>;
-def : Pat<(v2i32 (xor
- (v2i32 VPR64:$Rn),
- (v2i32 (bitconvert (v8i8 Neon_AllOne))))),
- (NOT8b VPR64:$Rn)>;
-def : Pat<(v2i64 (xor
- (v2i64 VPR128:$Rn),
- (v2i64 (bitconvert (v16i8 Neon_AllOne))))),
- (NOT16b VPR128:$Rn)>;
-
-def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))),
- (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))),
- (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>;
-
-multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode,
- SDPatternOperator Neon_Op> {
- def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [(set (v4f32 VPR128:$Rd),
- (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2d, $Rn.2d",
- [(set (v2f64 VPR128:$Rd),
- (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.2s",
- [(set (v2f32 VPR64:$Rd),
- (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>;
-defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>;
-
-multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> {
- def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.8b, $Rn.8h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4h, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2s, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- let Constraints = "$Rd = $src" in {
- def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "2\t$Rd.16b, $Rn.8h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "2\t$Rd.8h, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "2\t$Rd.4s, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>;
-defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>;
-defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>;
-defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>;
-
-multiclass NeonI_2VMisc_Narrow_Patterns<string Prefix,
- SDPatternOperator Neon_Op> {
- def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))),
- (v8i8 (!cast<Instruction>(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>;
-
- def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))),
- (v4i16 (!cast<Instruction>(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>;
-
- def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))),
- (v2i32 (!cast<Instruction>(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>;
-
- def : Pat<(v16i8 (concat_vectors
- (v8i8 VPR64:$src),
- (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))),
- (!cast<Instruction>(Prefix # 8h16b)
- (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
- VPR128:$Rn)>;
-
- def : Pat<(v8i16 (concat_vectors
- (v4i16 VPR64:$src),
- (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))),
- (!cast<Instruction>(Prefix # 4s8h)
- (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
- VPR128:$Rn)>;
-
- def : Pat<(v4i32 (concat_vectors
- (v2i32 VPR64:$src),
- (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))),
- (!cast<Instruction>(Prefix # 2d4s)
- (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
- VPR128:$Rn)>;
-}
-
-defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>;
-defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>;
-defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>;
-defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>;
-
-multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> {
- let DecoderMethod = "DecodeSHLLInstruction" in {
- def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR128:$Rd),
- (ins VPR64:$Rn, uimm_exact8:$Imm),
- asmop # "\t$Rd.8h, $Rn.8b, $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR128:$Rd),
- (ins VPR64:$Rn, uimm_exact16:$Imm),
- asmop # "\t$Rd.4s, $Rn.4h, $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode,
- (outs VPR128:$Rd),
- (ins VPR64:$Rn, uimm_exact32:$Imm),
- asmop # "\t$Rd.2d, $Rn.2s, $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd),
- (ins VPR128:$Rn, uimm_exact8:$Imm),
- asmop # "2\t$Rd.8h, $Rn.16b, $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd),
- (ins VPR128:$Rn, uimm_exact16:$Imm),
- asmop # "2\t$Rd.4s, $Rn.8h, $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
- (outs VPR128:$Rd),
- (ins VPR128:$Rn, uimm_exact32:$Imm),
- asmop # "2\t$Rd.2d, $Rn.4s, $Imm",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
- }
-}
-
-defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>;
-
-class NeonI_SHLL_Patterns<ValueType OpTy, ValueType DesTy,
- SDPatternOperator ExtOp, Operand Neon_Imm,
- string suffix>
- : Pat<(DesTy (shl
- (DesTy (ExtOp (OpTy VPR64:$Rn))),
- (DesTy (Neon_vdup
- (i32 Neon_Imm:$Imm))))),
- (!cast<Instruction>("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>;
-
-class NeonI_SHLL_High_Patterns<ValueType OpTy, ValueType DesTy,
- SDPatternOperator ExtOp, Operand Neon_Imm,
- string suffix, PatFrag GetHigh>
- : Pat<(DesTy (shl
- (DesTy (ExtOp
- (OpTy (GetHigh VPR128:$Rn)))),
- (DesTy (Neon_vdup
- (i32 Neon_Imm:$Imm))))),
- (!cast<Instruction>("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>;
-
-def : NeonI_SHLL_Patterns<v8i8, v8i16, zext, uimm_exact8, "8b8h">;
-def : NeonI_SHLL_Patterns<v8i8, v8i16, sext, uimm_exact8, "8b8h">;
-def : NeonI_SHLL_Patterns<v4i16, v4i32, zext, uimm_exact16, "4h4s">;
-def : NeonI_SHLL_Patterns<v4i16, v4i32, sext, uimm_exact16, "4h4s">;
-def : NeonI_SHLL_Patterns<v2i32, v2i64, zext, uimm_exact32, "2s2d">;
-def : NeonI_SHLL_Patterns<v2i32, v2i64, sext, uimm_exact32, "2s2d">;
-def : NeonI_SHLL_High_Patterns<v8i8, v8i16, zext, uimm_exact8, "16b8h",
- Neon_High16B>;
-def : NeonI_SHLL_High_Patterns<v8i8, v8i16, sext, uimm_exact8, "16b8h",
- Neon_High16B>;
-def : NeonI_SHLL_High_Patterns<v4i16, v4i32, zext, uimm_exact16, "8h4s",
- Neon_High8H>;
-def : NeonI_SHLL_High_Patterns<v4i16, v4i32, sext, uimm_exact16, "8h4s",
- Neon_High8H>;
-def : NeonI_SHLL_High_Patterns<v2i32, v2i64, zext, uimm_exact32, "4s2d",
- Neon_High4S>;
-def : NeonI_SHLL_High_Patterns<v2i32, v2i64, sext, uimm_exact32, "4s2d",
- Neon_High4S>;
-
-multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> {
- def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4h, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2s, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- let Constraints = "$src = $Rd" in {
- def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "2\t$Rd.8h, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
- def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "2\t$Rd.4s, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
- }
-}
-
-defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>;
-
-multiclass NeonI_2VMisc_Narrow_Pattern<string prefix,
- SDPatternOperator f32_to_f16_Op,
- SDPatternOperator f64_to_f32_Op> {
-
- def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))),
- (!cast<Instruction>(prefix # "4s4h") (v4f32 VPR128:$Rn))>;
-
- def : Pat<(v8i16 (concat_vectors
- (v4i16 VPR64:$src),
- (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))),
- (!cast<Instruction>(prefix # "4s8h")
- (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
- (v4f32 VPR128:$Rn))>;
-
- def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))),
- (!cast<Instruction>(prefix # "2d2s") (v2f64 VPR128:$Rn))>;
-
- def : Pat<(v4f32 (concat_vectors
- (v2f32 VPR64:$src),
- (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))),
- (!cast<Instruction>(prefix # "2d4s")
- (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
- (v2f64 VPR128:$Rn))>;
-}
-
-defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>;
-
-multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U,
- bits<5> opcode> {
- def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR64:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2s, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "2\t$Rd.4s, $Rn.2d",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- }
-
- def : Pat<(v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))),
- (!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>;
-
- def : Pat<(v4f32 (concat_vectors
- (v2f32 VPR64:$src),
- (v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))))),
- (!cast<Instruction>(prefix # "2d4s")
- (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
- VPR128:$Rn)>;
-}
-
-defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>;
-
-def Neon_High4Float : PatFrag<(ops node:$in),
- (extract_subvector (v4f32 node:$in), (iPTR 2))>;
-
-multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> {
- def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.4s, $Rn.4h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2d, $Rn.2s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "2\t$Rd.4s, $Rn.8h",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "2\t$Rd.2d, $Rn.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>;
-
-multiclass NeonI_2VMisc_Extend_Pattern<string prefix> {
- def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))),
- (!cast<Instruction>(prefix # "4h4s") VPR64:$Rn)>;
-
- def : Pat<(v4f32 (int_arm_neon_vcvthf2fp
- (v4i16 (Neon_High8H
- (v8i16 VPR128:$Rn))))),
- (!cast<Instruction>(prefix # "8h4s") VPR128:$Rn)>;
-
- def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))),
- (!cast<Instruction>(prefix # "2s2d") VPR64:$Rn)>;
-
- def : Pat<(v2f64 (fextend
- (v2f32 (Neon_High4Float
- (v4f32 VPR128:$Rn))))),
- (!cast<Instruction>(prefix # "4s2d") VPR128:$Rn)>;
-}
-
-defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">;
-
-multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode,
- ValueType ResTy4s, ValueType OpTy4s,
- ValueType ResTy2d, ValueType OpTy2d,
- ValueType ResTy2s, ValueType OpTy2s,
- SDPatternOperator Neon_Op> {
-
- def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [(set (ResTy4s VPR128:$Rd),
- (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.2d, $Rn.2d",
- [(set (ResTy2d VPR128:$Rd),
- (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.2s",
- [(set (ResTy2s VPR64:$Rd),
- (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
- bits<5> opcode, SDPatternOperator Neon_Op> {
- defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4i32, v4f32, v2i64,
- v2f64, v2i32, v2f32, Neon_Op>;
-}
-
-defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010,
- int_arm_neon_vcvtns>;
-defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010,
- int_arm_neon_vcvtnu>;
-defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010,
- int_arm_neon_vcvtps>;
-defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010,
- int_arm_neon_vcvtpu>;
-defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011,
- int_arm_neon_vcvtms>;
-defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011,
- int_arm_neon_vcvtmu>;
-defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>;
-defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>;
-defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100,
- int_arm_neon_vcvtas>;
-defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100,
- int_arm_neon_vcvtau>;
-
-multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
- bits<5> opcode, SDPatternOperator Neon_Op> {
- defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64,
- v2i64, v2f32, v2i32, Neon_Op>;
-}
-
-defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>;
-defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>;
-
-multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U,
- bits<5> opcode, SDPatternOperator Neon_Op> {
- defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64,
- v2f64, v2f32, v2f32, Neon_Op>;
-}
-
-defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000,
- int_aarch64_neon_frintn>;
-defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>;
-defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>;
-defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>;
-defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>;
-defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>;
-defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>;
-defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
- int_arm_neon_vrecpe>;
-defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
- int_arm_neon_vrsqrte>;
-let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
-defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
-}
-
-multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
- bits<5> opcode, SDPatternOperator Neon_Op> {
- def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
- def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn),
- asmop # "\t$Rd.2s, $Rn.2s",
- [(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
- int_arm_neon_vrecpe>;
-defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100,
- int_arm_neon_vrsqrte>;
-
-// Crypto Class
-class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
- string asmop, SDPatternOperator opnode>
- : NeonI_Crypto_AES<size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.16b, $Rn.16b",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (opnode (v16i8 VPR128:$src),
- (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- let Predicates = [HasNEON, HasCrypto];
-}
-
-def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>;
-def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>;
-
-class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
- string asmop, SDPatternOperator opnode>
- : NeonI_Crypto_AES<size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn),
- asmop # "\t$Rd.16b, $Rn.16b",
- [(set (v16i8 VPR128:$Rd),
- (v16i8 (opnode (v16i8 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]>;
-
-def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
-def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;
-
-class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
- string asmop, SDPatternOperator opnode>
- : NeonI_Crypto_SHA<size, opcode,
- (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
- asmop # "\t$Rd.4s, $Rn.4s",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (opnode (v4i32 VPR128:$src),
- (v4i32 VPR128:$Rn))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1",
- int_arm_neon_sha1su1>;
-def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0",
- int_arm_neon_sha256su0>;
-
-class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
- string asmop, SDPatternOperator opnode>
- : NeonI_Crypto_SHA<size, opcode,
- (outs FPR32:$Rd), (ins FPR32:$Rn),
- asmop # "\t$Rd, $Rn",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU]> {
- let Predicates = [HasNEON, HasCrypto];
- let hasSideEffects = 0;
-}
-
-def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
-def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
- (COPY_TO_REGCLASS (SHA1H (COPY_TO_REGCLASS i32:$Rn, FPR32)), GPR32)>;
-
-
-class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
- SDPatternOperator opnode>
- : NeonI_Crypto_3VSHA<size, opcode,
- (outs VPR128:$Rd),
- (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
- [(set (v4i32 VPR128:$Rd),
- (v4i32 (opnode (v4i32 VPR128:$src),
- (v4i32 VPR128:$Rn),
- (v4i32 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0",
- int_arm_neon_sha1su0>;
-def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1",
- int_arm_neon_sha256su1>;
-
-class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
- SDPatternOperator opnode>
- : NeonI_Crypto_3VSHA<size, opcode,
- (outs FPR128:$Rd),
- (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd, $Rn, $Rm.4s",
- [(set (v4i32 FPR128:$Rd),
- (v4i32 (opnode (v4i32 FPR128:$src),
- (v4i32 FPR128:$Rn),
- (v4i32 VPR128:$Rm))))],
- NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
- int_arm_neon_sha256h>;
-def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
- int_arm_neon_sha256h2>;
-
-class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop>
- : NeonI_Crypto_3VSHA<size, opcode,
- (outs FPR128:$Rd),
- (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
- asmop # "\t$Rd, $Rn, $Rm.4s",
- [], NoItinerary>,
- Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
- let Constraints = "$src = $Rd";
- let hasSideEffects = 0;
- let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c">;
-def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p">;
-def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m">;
-
-def : Pat<(int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
- (SHA1C v4i32:$hash_abcd,
- (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
-def : Pat<(int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
- (SHA1M v4i32:$hash_abcd,
- (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
-def : Pat<(int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
- (SHA1P v4i32:$hash_abcd,
- (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
-
-// Additional patterns to match shl to USHL.
-def : Pat<(v8i8 (shl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
- (USHLvvv_8B $Rn, $Rm)>;
-def : Pat<(v4i16 (shl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
- (USHLvvv_4H $Rn, $Rm)>;
-def : Pat<(v2i32 (shl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
- (USHLvvv_2S $Rn, $Rm)>;
-def : Pat<(v1i64 (shl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
- (USHLddd $Rn, $Rm)>;
-def : Pat<(v16i8 (shl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
- (USHLvvv_16B $Rn, $Rm)>;
-def : Pat<(v8i16 (shl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
- (USHLvvv_8H $Rn, $Rm)>;
-def : Pat<(v4i32 (shl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
- (USHLvvv_4S $Rn, $Rm)>;
-def : Pat<(v2i64 (shl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
- (USHLvvv_2D $Rn, $Rm)>;
-
-def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
- (EXTRACT_SUBREG
- (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
- sub_8)>;
-def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (EXTRACT_SUBREG
- (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
- sub_16)>;
-def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (EXTRACT_SUBREG
- (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
- sub_32)>;
-
-// Additional patterns to match sra, srl.
-// For a vector right shift by vector, the shift amounts of SSHL/USHL are
-// negative. Negate the vector of shift amount first.
-def : Pat<(v8i8 (srl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
- (USHLvvv_8B $Rn, (NEG8b $Rm))>;
-def : Pat<(v4i16 (srl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
- (USHLvvv_4H $Rn, (NEG4h $Rm))>;
-def : Pat<(v2i32 (srl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
- (USHLvvv_2S $Rn, (NEG2s $Rm))>;
-def : Pat<(v1i64 (srl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
- (USHLddd $Rn, (NEGdd $Rm))>;
-def : Pat<(v16i8 (srl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
- (USHLvvv_16B $Rn, (NEG16b $Rm))>;
-def : Pat<(v8i16 (srl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
- (USHLvvv_8H $Rn, (NEG8h $Rm))>;
-def : Pat<(v4i32 (srl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
- (USHLvvv_4S $Rn, (NEG4s $Rm))>;
-def : Pat<(v2i64 (srl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
- (USHLvvv_2D $Rn, (NEG2d $Rm))>;
-
-def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
- (EXTRACT_SUBREG
- (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))),
- sub_8)>;
-def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (EXTRACT_SUBREG
- (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))),
- sub_16)>;
-def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (EXTRACT_SUBREG
- (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))),
- sub_32)>;
-
-def : Pat<(v8i8 (sra (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
- (SSHLvvv_8B $Rn, (NEG8b $Rm))>;
-def : Pat<(v4i16 (sra (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
- (SSHLvvv_4H $Rn, (NEG4h $Rm))>;
-def : Pat<(v2i32 (sra (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
- (SSHLvvv_2S $Rn, (NEG2s $Rm))>;
-def : Pat<(v1i64 (sra (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
- (SSHLddd $Rn, (NEGdd $Rm))>;
-def : Pat<(v16i8 (sra (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
- (SSHLvvv_16B $Rn, (NEG16b $Rm))>;
-def : Pat<(v8i16 (sra (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
- (SSHLvvv_8H $Rn, (NEG8h $Rm))>;
-def : Pat<(v4i32 (sra (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
- (SSHLvvv_4S $Rn, (NEG4s $Rm))>;
-def : Pat<(v2i64 (sra (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
- (SSHLvvv_2D $Rn, (NEG2d $Rm))>;
-
-def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
- (EXTRACT_SUBREG
- (SSHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
- (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))),
- sub_8)>;
-def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
- (EXTRACT_SUBREG
- (SSHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
- (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))),
- sub_16)>;
-def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
- (EXTRACT_SUBREG
- (SSHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
- (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))),
- sub_32)>;
-
-//
-// Patterns for handling half-precision values
-//
-
-// Convert between f16 value and f32 value
-def : Pat<(f32 (f16_to_f32 (i32 GPR32:$Rn))),
- (FCVTsh (EXTRACT_SUBREG (FMOVsw $Rn), sub_16))>;
-def : Pat<(i32 (f32_to_f16 (f32 FPR32:$Rn))),
- (FMOVws (SUBREG_TO_REG (i64 0), (f16 (FCVThs $Rn)), sub_16))>;
-
-// Convert f16 value coming in as i16 value to f32
-def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))),
- (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
-def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))),
- (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
-
-def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 (
- f32_to_f16 (f32 FPR32:$Rn))))))),
- (f32 FPR32:$Rn)>;
-
-// Patterns for vector extract of half-precision FP value in i16 storage type
-def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
- (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))),
- (FCVTsh (f16 (DUPhv_H
- (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- neon_uimm2_bare:$Imm)))>;
-
-def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
- (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))),
- (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>;
-
-// Patterns for vector insert of half-precision FP value 0 in i16 storage type
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
- (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
- (neon_uimm3_bare:$Imm))),
- (v8i16 (INSELh (v8i16 VPR128:$Rn),
- (v8i16 (SUBREG_TO_REG (i64 0),
- (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
- sub_16)),
- neon_uimm3_bare:$Imm, 0))>;
-
-def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
- (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
- (neon_uimm2_bare:$Imm))),
- (v4i16 (EXTRACT_SUBREG
- (v8i16 (INSELh
- (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- (v8i16 (SUBREG_TO_REG (i64 0),
- (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
- sub_16)),
- neon_uimm2_bare:$Imm, 0)),
- sub_64))>;
-
-// Patterns for vector insert of half-precision FP value in i16 storage type
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
- (i32 (assertsext (i32 (fp_to_sint
- (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
- (neon_uimm3_bare:$Imm))),
- (v8i16 (INSELh (v8i16 VPR128:$Rn),
- (v8i16 (SUBREG_TO_REG (i64 0),
- (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
- sub_16)),
- neon_uimm3_bare:$Imm, 0))>;
-
-def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
- (i32 (assertsext (i32 (fp_to_sint
- (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
- (neon_uimm2_bare:$Imm))),
- (v4i16 (EXTRACT_SUBREG
- (v8i16 (INSELh
- (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- (v8i16 (SUBREG_TO_REG (i64 0),
- (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
- sub_16)),
- neon_uimm2_bare:$Imm, 0)),
- sub_64))>;
-
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
- (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
- (neon_uimm3_bare:$Imm1))),
- (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
- neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
-
-// Patterns for vector copy of half-precision FP value in i16 storage type
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
- (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
- (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
- 65535)))))))),
- (neon_uimm3_bare:$Imm1))),
- (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
- neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
-
-def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
- (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
- (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)),
- 65535)))))))),
- (neon_uimm3_bare:$Imm1))),
- (v4i16 (EXTRACT_SUBREG
- (v8i16 (INSELh
- (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
- neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)),
- sub_64))>;
-
-
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
new file mode 100644
index 0000000..e7454be
--- /dev/null
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -0,0 +1,942 @@
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
+/// load / store instructions to form ldp / stp instructions.
+
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+ "Number of load/store from unscaled generated");
+
+static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20),
+ cl::Hidden);
+
+// Place holder while testing unscaled load/store combining
+static cl::opt<bool>
+EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden,
+ cl::desc("Allow AArch64 unscaled load/store combining"),
+ cl::init(true));
+
+namespace {
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
+ static char ID;
+ AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
+
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ // Scan the instructions looking for a load/store that can be combined
+ // with the current instruction into a load/store pair.
+ // Return the matching instruction if one is found, else MBB->end().
+ // If a matching instruction is found, mergeForward is set to true if the
+ // merge is to remove the first instruction and replace the second with
+ // a pair-wise insn, and false if the reverse is true.
+ MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+ bool &mergeForward,
+ unsigned Limit);
+ // Merge the two instructions indicated into a single pair-wise instruction.
+ // If mergeForward is true, erase the first instruction and fold its
+ // operation into the second. If false, the reverse. Return the instruction
+ // following the first instruction (which may change during processing).
+ MachineBasicBlock::iterator
+ mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired, bool mergeForward);
+
+ // Scan the instruction list to find a base register update that can
+ // be combined with the current instruction (a load or store) using
+ // pre or post indexed addressing with writeback. Scan forwards.
+ MachineBasicBlock::iterator
+ findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
+ int Value);
+
+ // Scan the instruction list to find a base register update that can
+ // be combined with the current instruction (a load or store) using
+ // pre or post indexed addressing with writeback. Scan backwards.
+ MachineBasicBlock::iterator
+ findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
+
+ // Merge a pre-index base register update into a ld/st instruction.
+ MachineBasicBlock::iterator
+ mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update);
+
+ // Merge a post-index base register update into a ld/st instruction.
+ MachineBasicBlock::iterator
+ mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update);
+
+ bool optimizeBlock(MachineBasicBlock &MBB);
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ const char *getPassName() const override {
+ return "AArch64 load / store optimization pass";
+ }
+
+private:
+ int getMemSize(MachineInstr *MemMI);
+};
+char AArch64LoadStoreOpt::ID = 0;
+}
+
+static bool isUnscaledLdst(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::STURSi:
+ return true;
+ case AArch64::STURDi:
+ return true;
+ case AArch64::STURQi:
+ return true;
+ case AArch64::STURWi:
+ return true;
+ case AArch64::STURXi:
+ return true;
+ case AArch64::LDURSi:
+ return true;
+ case AArch64::LDURDi:
+ return true;
+ case AArch64::LDURQi:
+ return true;
+ case AArch64::LDURWi:
+ return true;
+ case AArch64::LDURXi:
+ return true;
+ }
+}
+
+// Size in bytes of the data moved by an unscaled load or store
+int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
+ switch (MemMI->getOpcode()) {
+ default:
+ llvm_unreachable("Opcode has has unknown size!");
+ case AArch64::STRSui:
+ case AArch64::STURSi:
+ return 4;
+ case AArch64::STRDui:
+ case AArch64::STURDi:
+ return 8;
+ case AArch64::STRQui:
+ case AArch64::STURQi:
+ return 16;
+ case AArch64::STRWui:
+ case AArch64::STURWi:
+ return 4;
+ case AArch64::STRXui:
+ case AArch64::STURXi:
+ return 8;
+ case AArch64::LDRSui:
+ case AArch64::LDURSi:
+ return 4;
+ case AArch64::LDRDui:
+ case AArch64::LDURDi:
+ return 8;
+ case AArch64::LDRQui:
+ case AArch64::LDURQi:
+ return 16;
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ return 4;
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ return 8;
+ }
+}
+
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no pairwise equivalent!");
+ case AArch64::STRSui:
+ case AArch64::STURSi:
+ return AArch64::STPSi;
+ case AArch64::STRDui:
+ case AArch64::STURDi:
+ return AArch64::STPDi;
+ case AArch64::STRQui:
+ case AArch64::STURQi:
+ return AArch64::STPQi;
+ case AArch64::STRWui:
+ case AArch64::STURWi:
+ return AArch64::STPWi;
+ case AArch64::STRXui:
+ case AArch64::STURXi:
+ return AArch64::STPXi;
+ case AArch64::LDRSui:
+ case AArch64::LDURSi:
+ return AArch64::LDPSi;
+ case AArch64::LDRDui:
+ case AArch64::LDURDi:
+ return AArch64::LDPDi;
+ case AArch64::LDRQui:
+ case AArch64::LDURQi:
+ return AArch64::LDPQi;
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ return AArch64::LDPWi;
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ return AArch64::LDPXi;
+ }
+}
+
+static unsigned getPreIndexedOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no pre-indexed equivalent!");
+ case AArch64::STRSui: return AArch64::STRSpre;
+ case AArch64::STRDui: return AArch64::STRDpre;
+ case AArch64::STRQui: return AArch64::STRQpre;
+ case AArch64::STRWui: return AArch64::STRWpre;
+ case AArch64::STRXui: return AArch64::STRXpre;
+ case AArch64::LDRSui: return AArch64::LDRSpre;
+ case AArch64::LDRDui: return AArch64::LDRDpre;
+ case AArch64::LDRQui: return AArch64::LDRQpre;
+ case AArch64::LDRWui: return AArch64::LDRWpre;
+ case AArch64::LDRXui: return AArch64::LDRXpre;
+ }
+}
+
+static unsigned getPostIndexedOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no post-indexed wise equivalent!");
+ case AArch64::STRSui:
+ return AArch64::STRSpost;
+ case AArch64::STRDui:
+ return AArch64::STRDpost;
+ case AArch64::STRQui:
+ return AArch64::STRQpost;
+ case AArch64::STRWui:
+ return AArch64::STRWpost;
+ case AArch64::STRXui:
+ return AArch64::STRXpost;
+ case AArch64::LDRSui:
+ return AArch64::LDRSpost;
+ case AArch64::LDRDui:
+ return AArch64::LDRDpost;
+ case AArch64::LDRQui:
+ return AArch64::LDRQpost;
+ case AArch64::LDRWui:
+ return AArch64::LDRWpost;
+ case AArch64::LDRXui:
+ return AArch64::LDRXpost;
+ }
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ bool mergeForward) {
+ MachineBasicBlock::iterator NextI = I;
+ ++NextI;
+ // If NextI is the second of the two instructions to be merged, we need
+ // to skip one further. Either way we merge will invalidate the iterator,
+ // and we don't need to scan the new instruction, as it's a pairwise
+ // instruction, which we're not considering for further action anyway.
+ if (NextI == Paired)
+ ++NextI;
+
+ bool IsUnscaled = isUnscaledLdst(I->getOpcode());
+ int OffsetStride =
+ IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;
+
+ unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
+ // Insert our new paired instruction after whichever of the paired
+ // instructions mergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I;
+ // Also based on mergeForward is from where we copy the base register operand
+ // so we get the flags compatible with the input code.
+ MachineOperand &BaseRegOp =
+ mergeForward ? Paired->getOperand(1) : I->getOperand(1);
+
+ // Which register is Rt and which is Rt2 depends on the offset order.
+ MachineInstr *RtMI, *Rt2MI;
+ if (I->getOperand(2).getImm() ==
+ Paired->getOperand(2).getImm() + OffsetStride) {
+ RtMI = Paired;
+ Rt2MI = I;
+ } else {
+ RtMI = I;
+ Rt2MI = Paired;
+ }
+ // Handle Unscaled
+ int OffsetImm = RtMI->getOperand(2).getImm();
+ if (IsUnscaled && EnableAArch64UnscaledMemOp)
+ OffsetImm /= OffsetStride;
+
+ // Construct the new instruction.
+ MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
+ I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(RtMI->getOperand(0))
+ .addOperand(Rt2MI->getOperand(0))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+ (void)MIB;
+
+ // FIXME: Do we need/want to copy the mem operands from the source
+ // instructions? Probably. What uses them after this?
+
+ DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Paired->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+
+ return NextI;
+}
+
+/// trackRegDefsUses - Remember what registers the specified instruction uses
+/// and modifies.
+static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+ BitVector &UsedRegs,
+ const TargetRegisterInfo *TRI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isRegMask())
+ ModifiedRegs.setBitsNotInMask(MO.getRegMask());
+
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MO.isDef()) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ ModifiedRegs.set(*AI);
+ } else {
+ assert(MO.isUse() && "Reg operand not a def and not a use?!?");
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ UsedRegs.set(*AI);
+ }
+ }
+}
+
+static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
+ if (!IsUnscaled && (Offset > 63 || Offset < -64))
+ return false;
+ if (IsUnscaled) {
+ // Convert the byte-offset used by unscaled into an "element" offset used
+ // by the scaled pair load/store instructions.
+ int elemOffset = Offset / OffsetStride;
+ if (elemOffset > 63 || elemOffset < -64)
+ return false;
+ }
+ return true;
+}
+
+// Do alignment, specialized to power of 2 and for signed ints,
+// avoiding having to do a C-style cast from uint_64t to int when
+// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
+// FIXME: Move this function to include/MathExtras.h?
+static int alignTo(int Num, int PowOf2) {
+ return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
+}
+
+/// findMatchingInsn - Scan the instructions looking for a load/store that can
+/// be combined with the current instruction into a load/store pair.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+ bool &mergeForward, unsigned Limit) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr *FirstMI = I;
+ ++MBBI;
+
+ int Opc = FirstMI->getOpcode();
+ bool mayLoad = FirstMI->mayLoad();
+ bool IsUnscaled = isUnscaledLdst(Opc);
+ unsigned Reg = FirstMI->getOperand(0).getReg();
+ unsigned BaseReg = FirstMI->getOperand(1).getReg();
+ int Offset = FirstMI->getOperand(2).getImm();
+
+ // Early exit if the first instruction modifies the base register.
+ // e.g., ldr x0, [x0]
+ // Early exit if the offset if not possible to match. (6 bits of positive
+ // range, plus allow an extra one in case we find a later insn that matches
+ // with Offset-1
+ if (FirstMI->modifiesRegister(BaseReg, TRI))
+ return E;
+ int OffsetStride =
+ IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
+ if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+ return E;
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+ for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
+ // If we've found another instruction with the same opcode, check to see
+ // if the base and offset are compatible with our starting instruction.
+ // These instructions all have scaled immediate operands, so we just
+ // check for +1/-1. Make sure to check the new instruction offset is
+ // actually an immediate and not a symbolic reference destined for
+ // a relocation.
+ //
+ // Pairwise instructions have a 7-bit signed offset field. Single insns
+ // have a 12-bit unsigned offset field. To be a valid combine, the
+ // final offset must be in range.
+ unsigned MIBaseReg = MI->getOperand(1).getReg();
+ int MIOffset = MI->getOperand(2).getImm();
+ if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+ (Offset + OffsetStride == MIOffset))) {
+ int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+ // If this is a volatile load/store that otherwise matched, stop looking
+ // as something is going on that we don't have enough information to
+ // safely transform. Similarly, stop if we see a hint to avoid pairs.
+ if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+ return E;
+ // If the resultant immediate offset of merging these instructions
+ // is out of range for a pairwise instruction, bail and keep looking.
+ bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
+ if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ continue;
+ }
+ // If the alignment requirements of the paired (scaled) instruction
+ // can't express the offset of the unscaled input, bail and keep
+ // looking.
+ if (IsUnscaled && EnableAArch64UnscaledMemOp &&
+ (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ continue;
+ }
+ // If the destination register of the loads is the same register, bail
+ // and keep looking. A load-pair instruction with both destination
+ // registers the same is UNPREDICTABLE and will result in an exception.
+ if (mayLoad && Reg == MI->getOperand(0).getReg()) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ continue;
+ }
+
+ // If the Rt of the second instruction was not modified or used between
+ // the two instructions, we can combine the second into the first.
+ if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
+ !UsedRegs[MI->getOperand(0).getReg()]) {
+ mergeForward = false;
+ return MBBI;
+ }
+
+ // Likewise, if the Rt of the first instruction is not modified or used
+ // between the two instructions, we can combine the first into the
+ // second.
+ if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
+ !UsedRegs[FirstMI->getOperand(0).getReg()]) {
+ mergeForward = true;
+ return MBBI;
+ }
+ // Unable to combine these instructions due to interference in between.
+ // Keep looking.
+ }
+ }
+
+ // If the instruction wasn't a matching load or store, but does (or can)
+ // modify memory, stop searching, as we don't have alias analysis or
+ // anything like that to tell us whether the access is tromping on the
+ // locations we care about. The big one we want to catch is calls.
+ //
+ // FIXME: Theoretically, we can do better than that for SP and FP based
+ // references since we can effectively know where those are touching. It's
+ // unclear if it's worth the extra code, though. Most paired instructions
+ // will be sequential, perhaps with a few intervening non-memory related
+ // instructions.
+ if (MI->mayStore() || MI->isCall())
+ return E;
+ // Likewise, if we're matching a store instruction, we don't want to
+ // move across a load, as it may be reading the same location.
+ if (FirstMI->mayStore() && MI->mayLoad())
+ return E;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return E;
+ }
+ return E;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update) {
+ assert((Update->getOpcode() == AArch64::ADDXri ||
+ Update->getOpcode() == AArch64::SUBXri) &&
+ "Unexpected base register update instruction to merge!");
+ MachineBasicBlock::iterator NextI = I;
+ // Return the instruction following the merged instruction, which is
+ // the instruction following our unmerged load. Unless that's the add/sub
+ // instruction we're merging, in which case it's the one after that.
+ if (++NextI == Update)
+ ++NextI;
+
+ int Value = Update->getOperand(2).getImm();
+ assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+ "Can't merge 1 << 12 offset into pre-indexed load / store");
+ if (Update->getOpcode() == AArch64::SUBXri)
+ Value = -Value;
+
+ unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
+ MachineInstrBuilder MIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(Update->getOperand(0))
+ .addOperand(I->getOperand(0))
+ .addOperand(I->getOperand(1))
+ .addImm(Value);
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Update->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
+ MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
+ assert((Update->getOpcode() == AArch64::ADDXri ||
+ Update->getOpcode() == AArch64::SUBXri) &&
+ "Unexpected base register update instruction to merge!");
+ MachineBasicBlock::iterator NextI = I;
+ // Return the instruction following the merged instruction, which is
+ // the instruction following our unmerged load. Unless that's the add/sub
+ // instruction we're merging, in which case it's the one after that.
+ if (++NextI == Update)
+ ++NextI;
+
+ int Value = Update->getOperand(2).getImm();
+ assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+ "Can't merge 1 << 12 offset into post-indexed load / store");
+ if (Update->getOpcode() == AArch64::SUBXri)
+ Value = -Value;
+
+ unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
+ MachineInstrBuilder MIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(Update->getOperand(0))
+ .addOperand(I->getOperand(0))
+ .addOperand(I->getOperand(1))
+ .addImm(Value);
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating post-indexed load/store.");
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Update->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
+static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
+ int Offset) {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case AArch64::SUBXri:
+ // Negate the offset for a SUB instruction.
+ Offset *= -1;
+ // FALLTHROUGH
+ case AArch64::ADDXri:
+ // Make sure it's a vanilla immediate operand, not a relocation or
+ // anything else we can't handle.
+ if (!MI->getOperand(2).isImm())
+ break;
+ // Watch out for 1 << 12 shifted value.
+ if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
+ break;
+ // If the instruction has the base register as source and dest and the
+ // immediate will fit in a signed 9-bit integer, then we have a match.
+ if (MI->getOperand(0).getReg() == BaseReg &&
+ MI->getOperand(1).getReg() == BaseReg &&
+ MI->getOperand(2).getImm() <= 255 &&
+ MI->getOperand(2).getImm() >= -256) {
+ // If we have a non-zero Offset, we check that it matches the amount
+ // we're adding to the register.
+ if (!Offset || Offset == MI->getOperand(2).getImm())
+ return true;
+ }
+ break;
+ }
+ return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
+ MachineBasicBlock::iterator I, unsigned Limit, int Value) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr *MemMI = I;
+ MachineBasicBlock::iterator MBBI = I;
+ const MachineFunction &MF = *MemMI->getParent()->getParent();
+
+ unsigned DestReg = MemMI->getOperand(0).getReg();
+ unsigned BaseReg = MemMI->getOperand(1).getReg();
+ int Offset = MemMI->getOperand(2).getImm() *
+ TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+
+ // If the base register overlaps the destination register, we can't
+ // merge the update.
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+
+ // Scan forward looking for post-index opportunities.
+ // Updating instructions can't be formed if the memory insn already
+ // has an offset other than the value we're looking for.
+ if (Offset != Value)
+ return E;
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+ ++MBBI;
+ for (unsigned Count = 0; MBBI != E; ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingUpdateInsn(MI, BaseReg, Value))
+ return MBBI;
+
+ // Update the status of what the instruction clobbered and used.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ return E;
+ }
+ return E;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
+ MachineBasicBlock::iterator I, unsigned Limit) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr *MemMI = I;
+ MachineBasicBlock::iterator MBBI = I;
+ const MachineFunction &MF = *MemMI->getParent()->getParent();
+
+ unsigned DestReg = MemMI->getOperand(0).getReg();
+ unsigned BaseReg = MemMI->getOperand(1).getReg();
+ int Offset = MemMI->getOperand(2).getImm();
+ unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+
+ // If the load/store is the first instruction in the block, there's obviously
+ // not any matching update. Ditto if the memory offset isn't zero.
+ if (MBBI == B || Offset != 0)
+ return E;
+ // If the base register overlaps the destination register, we can't
+ // merge the update.
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+ --MBBI;
+ for (unsigned Count = 0; MBBI != B; --MBBI) {
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
+ return MBBI;
+
+ // Update the status of what the instruction clobbered and used.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ return E;
+ }
+ return E;
+}
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+ bool Modified = false;
+ // Two tranformations to do here:
+ // 1) Find loads and stores that can be merged into a single load or store
+ // pair instruction.
+ // e.g.,
+ // ldr x0, [x2]
+ // ldr x1, [x2, #8]
+ // ; becomes
+ // ldp x0, x1, [x2]
+ // 2) Find base register updates that can be merged into the load or store
+ // as a base-reg writeback.
+ // e.g.,
+ // ldr x0, [x2]
+ // add x2, x2, #4
+ // ; becomes
+ // ldr x0, [x2], #4
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ // do the unscaled versions as well
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef()) {
+ ++MBBI;
+ break;
+ }
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!MI->getOperand(2).isImm()) {
+ ++MBBI;
+ break;
+ }
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (TII->isLdStPairSuppressed(MI)) {
+ ++MBBI;
+ break;
+ }
+ // Look ahead up to ScanLimit instructions for a pairable instruction.
+ bool mergeForward = false;
+ MachineBasicBlock::iterator Paired =
+ findMatchingInsn(MBBI, mergeForward, ScanLimit);
+ if (Paired != E) {
+ // Merge the loads into a pair. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = mergePairedInsns(MBBI, Paired, mergeForward);
+
+ Modified = true;
+ ++NumPairCreated;
+ if (isUnscaledLdst(MI->getOpcode()))
+ ++NumUnscaledPairCreated;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ // Do update merging. It's simpler to keep this separate from the above
+ // switch, though not strictly necessary.
+ int Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ // do the unscaled versions as well
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!MI->getOperand(2).isImm()) {
+ ++MBBI;
+ break;
+ }
+ // Look ahead up to ScanLimit instructions for a mergable instruction.
+ MachineBasicBlock::iterator Update =
+ findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergePostIdxUpdateInsn(MBBI, Update);
+ Modified = true;
+ ++NumPostFolded;
+ break;
+ }
+ // Don't know how to handle pre/post-index versions, so move to the next
+ // instruction.
+ if (isUnscaledLdst(Opc)) {
+ ++MBBI;
+ break;
+ }
+
+ // Look back to try to find a pre-index instruction. For example,
+ // add x0, x0, #8
+ // ldr x1, [x0]
+ // merged into:
+ // ldr x1, [x0, #8]!
+ Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+
+ // Look forward to try to find a post-index instruction. For example,
+ // ldr x1, [x0, #64]
+ // add x0, x0, #64
+ // merged into:
+ // ldr x1, [x0, #64]!
+
+ // The immediate in the load/store is scaled by the size of the register
+ // being loaded. The immediate in the add we're looking for,
+ // however, is not, so adjust here.
+ int Value = MI->getOperand(2).getImm() *
+ TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
+ ->getSize();
+ Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+
+ // Nothing found. Just move to the next instruction.
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ return Modified;
+}
+
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetMachine &TM = Fn.getTarget();
+ TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+ TRI = TM.getRegisterInfo();
+
+ bool Modified = false;
+ for (auto &MBB : Fn)
+ Modified |= optimizeBlock(MBB);
+
+ return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
+// loads and stores near one another?
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+ return new AArch64LoadStoreOpt();
+}
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 3842bfd..ab6d375 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst -==//
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
//
// The LLVM Compiler Infrastructure
//
@@ -12,146 +12,191 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64AsmPrinter.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64MCInstLower.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Mangler.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-MCOperand
-AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO,
- const MCSymbol *Sym) const {
- const MCExpr *Expr = 0;
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
+ AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
- Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, OutContext);
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ return Printer.getSymbol(MO.getGlobal());
+}
- switch (MO.getTargetFlags()) {
- case AArch64II::MO_GOT:
- Expr = AArch64MCExpr::CreateGOT(Expr, OutContext);
- break;
- case AArch64II::MO_GOT_LO12:
- Expr = AArch64MCExpr::CreateGOTLo12(Expr, OutContext);
- break;
- case AArch64II::MO_LO12:
- Expr = AArch64MCExpr::CreateLo12(Expr, OutContext);
- break;
- case AArch64II::MO_DTPREL_G1:
- Expr = AArch64MCExpr::CreateDTPREL_G1(Expr, OutContext);
- break;
- case AArch64II::MO_DTPREL_G0_NC:
- Expr = AArch64MCExpr::CreateDTPREL_G0_NC(Expr, OutContext);
- break;
- case AArch64II::MO_GOTTPREL:
- Expr = AArch64MCExpr::CreateGOTTPREL(Expr, OutContext);
- break;
- case AArch64II::MO_GOTTPREL_LO12:
- Expr = AArch64MCExpr::CreateGOTTPRELLo12(Expr, OutContext);
- break;
- case AArch64II::MO_TLSDESC:
- Expr = AArch64MCExpr::CreateTLSDesc(Expr, OutContext);
- break;
- case AArch64II::MO_TLSDESC_LO12:
- Expr = AArch64MCExpr::CreateTLSDescLo12(Expr, OutContext);
- break;
- case AArch64II::MO_TPREL_G1:
- Expr = AArch64MCExpr::CreateTPREL_G1(Expr, OutContext);
- break;
- case AArch64II::MO_TPREL_G0_NC:
- Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext);
- break;
- case AArch64II::MO_ABS_G3:
- Expr = AArch64MCExpr::CreateABS_G3(Expr, OutContext);
- break;
- case AArch64II::MO_ABS_G2_NC:
- Expr = AArch64MCExpr::CreateABS_G2_NC(Expr, OutContext);
- break;
- case AArch64II::MO_ABS_G1_NC:
- Expr = AArch64MCExpr::CreateABS_G1_NC(Expr, OutContext);
- break;
- case AArch64II::MO_ABS_G0_NC:
- Expr = AArch64MCExpr::CreateABS_G0_NC(Expr, OutContext);
- break;
- case AArch64II::MO_NO_FLAG:
- // Expr is already correct
- break;
- default:
- llvm_unreachable("Unexpected MachineOperand flag");
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+ if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) {
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+ RefKind = MCSymbolRefExpr::VK_GOTPAGE;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_PAGEOFF)
+ RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
+ else
+ assert(0 && "Unexpected target flags with MO_GOT on GV operand");
+ } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) {
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+ RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_PAGEOFF)
+ RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
+ else
+ llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
+ } else {
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+ RefKind = MCSymbolRefExpr::VK_PAGE;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_PAGEOFF)
+ RefKind = MCSymbolRefExpr::VK_PAGEOFF;
+ }
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::CreateAdd(
+ Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+ return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ uint32_t RefFlags = 0;
+
+ if (MO.getTargetFlags() & AArch64II::MO_GOT)
+ RefFlags |= AArch64MCExpr::VK_GOT;
+ else if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+ TLSModel::Model Model;
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ Model = Printer.TM.getTLSModel(GV);
+ } else {
+ assert(MO.isSymbol() &&
+ StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
+ "unexpected external TLS symbol");
+ Model = TLSModel::GeneralDynamic;
+ }
+ switch (Model) {
+ case TLSModel::InitialExec:
+ RefFlags |= AArch64MCExpr::VK_GOTTPREL;
+ break;
+ case TLSModel::LocalExec:
+ RefFlags |= AArch64MCExpr::VK_TPREL;
+ break;
+ case TLSModel::LocalDynamic:
+ RefFlags |= AArch64MCExpr::VK_DTPREL;
+ break;
+ case TLSModel::GeneralDynamic:
+ RefFlags |= AArch64MCExpr::VK_TLSDESC;
+ break;
+ }
+ } else {
+ // No modifier means this is a generic reference, classified as absolute for
+ // the cases where it matters (:abs_g0: etc).
+ RefFlags |= AArch64MCExpr::VK_ABS;
}
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+ RefFlags |= AArch64MCExpr::VK_PAGE;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_PAGEOFF)
+ RefFlags |= AArch64MCExpr::VK_PAGEOFF;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3)
+ RefFlags |= AArch64MCExpr::VK_G3;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2)
+ RefFlags |= AArch64MCExpr::VK_G2;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1)
+ RefFlags |= AArch64MCExpr::VK_G1;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0)
+ RefFlags |= AArch64MCExpr::VK_G0;
+
+ if (MO.getTargetFlags() & AArch64II::MO_NC)
+ RefFlags |= AArch64MCExpr::VK_NC;
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
if (!MO.isJTI() && MO.getOffset())
- Expr = MCBinaryExpr::CreateAdd(Expr,
- MCConstantExpr::Create(MO.getOffset(),
- OutContext),
- OutContext);
+ Expr = MCBinaryExpr::CreateAdd(
+ Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+
+ AArch64MCExpr::VariantKind RefKind;
+ RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags);
+ Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx);
return MCOperand::CreateExpr(Expr);
}
-bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
- MCOperand &MCOp) const {
+MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ if (TargetTriple.isOSDarwin())
+ return lowerSymbolOperandDarwin(MO, Sym);
+
+ assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target");
+ return lowerSymbolOperandELF(MO, Sym);
+}
+
+bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+ MCOperand &MCOp) const {
switch (MO.getType()) {
- default: llvm_unreachable("unknown operand type");
+ default:
+ assert(0 && "unknown operand type");
case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
if (MO.isImplicit())
return false;
- assert(!MO.getSubReg() && "Subregs should be eliminated!");
MCOp = MCOperand::CreateReg(MO.getReg());
break;
+ case MachineOperand::MO_RegisterMask:
+ // Regmasks are like implicit defs.
+ return false;
case MachineOperand::MO_Immediate:
MCOp = MCOperand::CreateImm(MO.getImm());
break;
- case MachineOperand::MO_FPImmediate: {
- assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported");
- MCOp = MCOperand::CreateFPImm(0.0);
- break;
- }
- case MachineOperand::MO_BlockAddress:
- MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
- break;
- case MachineOperand::MO_ExternalSymbol:
- MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::CreateExpr(
+ MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
break;
case MachineOperand::MO_GlobalAddress:
- MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal()));
+ MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
break;
- case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
- MO.getMBB()->getSymbol(), OutContext));
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
break;
case MachineOperand::MO_JumpTableIndex:
- MCOp = lowerSymbolOperand(MO, GetJTISymbol(MO.getIndex()));
+ MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
break;
case MachineOperand::MO_ConstantPoolIndex:
- MCOp = lowerSymbolOperand(MO, GetCPISymbol(MO.getIndex()));
+ MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
break;
- case MachineOperand::MO_RegisterMask:
- // Ignore call clobbers
- return false;
-
+ case MachineOperand::MO_BlockAddress:
+ MCOp = LowerSymbolOperand(
+ MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+ break;
}
-
return true;
}
-void llvm::LowerAArch64MachineInstrToMCInst(const MachineInstr *MI,
- MCInst &OutMI,
- AArch64AsmPrinter &AP) {
+void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
-
MCOperand MCOp;
- if (AP.lowerOperand(MO, MCOp))
+ if (lowerOperand(MI->getOperand(i), MCOp))
OutMI.addOperand(MCOp);
}
}
diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
new file mode 100644
index 0000000..ba50ba9
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MCInstLower.h
@@ -0,0 +1,52 @@
+//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64_MCINSTLOWER_H
+#define AArch64_MCINSTLOWER_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCAsmInfo;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+/// AArch64MCInstLower - This class is used to lower an MachineInstr
+/// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
+ MCContext &Ctx;
+ AsmPrinter &Printer;
+ Triple TargetTriple;
+
+public:
+ AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO,
+ MCSymbol *Sym) const;
+ MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
+ MCSymbol *Sym) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
deleted file mode 100644
index f45d8f7..0000000
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- AArch64MachineFuctionInfo.cpp - AArch64 machine function info -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file just contains the anchor for the AArch64MachineFunctionInfo to
-// force vtable emission.
-//
-//===----------------------------------------------------------------------===//
-#include "AArch64MachineFunctionInfo.h"
-
-using namespace llvm;
-
-void AArch64MachineFunctionInfo::anchor() { }
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 33da54f..7c257ba 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- AArch64MachineFuctionInfo.h - AArch64 machine function info -*- C++ -*-==//
+//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -11,17 +11,19 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AARCH64MACHINEFUNCTIONINFO_H
-#define AARCH64MACHINEFUNCTIONINFO_H
+#ifndef AArch64MACHINEFUNCTIONINFO_H
+#define AArch64MACHINEFUNCTIONINFO_H
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
namespace llvm {
-/// This class is derived from MachineFunctionInfo and contains private AArch64
-/// target-specific information for each MachineFunction.
-class AArch64MachineFunctionInfo : public MachineFunctionInfo {
- virtual void anchor();
+/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private AArch64-specific information for each MachineFunction.
+class AArch64FunctionInfo : public MachineFunctionInfo {
/// Number of bytes of arguments this function has on the stack. If the callee
/// is expected to restore the argument stack this should be a multiple of 16,
@@ -39,111 +41,123 @@
/// callee is expected to pop the args.
unsigned ArgumentStackToRestore;
- /// If the stack needs to be adjusted on frame entry in two stages, this
- /// records the size of the first adjustment just prior to storing
- /// callee-saved registers. The callee-saved slots are addressed assuming
- /// SP == <incoming-SP> - InitialStackAdjust.
- unsigned InitialStackAdjust;
+ /// HasStackFrame - True if this function has a stack frame. Set by
+ /// processFunctionBeforeCalleeSavedScan().
+ bool HasStackFrame;
- /// Number of local-dynamic TLS accesses.
- unsigned NumLocalDynamics;
+ /// \brief Amount of stack frame size, not including callee-saved registers.
+ unsigned LocalStackSize;
- /// @see AArch64 Procedure Call Standard, B.3
- ///
- /// The Frame index of the area where LowerFormalArguments puts the
- /// general-purpose registers that might contain variadic parameters.
- int VariadicGPRIdx;
+ /// \brief Number of TLS accesses using the special (combinable)
+ /// _TLS_MODULE_BASE_ symbol.
+ unsigned NumLocalDynamicTLSAccesses;
- /// @see AArch64 Procedure Call Standard, B.3
- ///
- /// The size of the frame object used to store the general-purpose registers
- /// which might contain variadic arguments. This is the offset from
- /// VariadicGPRIdx to what's stored in __gr_top.
- unsigned VariadicGPRSize;
+ /// \brief FrameIndex for start of varargs area for arguments passed on the
+ /// stack.
+ int VarArgsStackIndex;
- /// @see AArch64 Procedure Call Standard, B.3
- ///
- /// The Frame index of the area where LowerFormalArguments puts the
- /// floating-point registers that might contain variadic parameters.
- int VariadicFPRIdx;
+ /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// general purpose registers.
+ int VarArgsGPRIndex;
- /// @see AArch64 Procedure Call Standard, B.3
- ///
- /// The size of the frame object used to store the floating-point registers
- /// which might contain variadic arguments. This is the offset from
- /// VariadicFPRIdx to what's stored in __vr_top.
- unsigned VariadicFPRSize;
+ /// \brief Size of the varargs area for arguments passed in general purpose
+ /// registers.
+ unsigned VarArgsGPRSize;
- /// @see AArch64 Procedure Call Standard, B.3
- ///
- /// The Frame index of an object pointing just past the last known stacked
- /// argument on entry to a variadic function. This goes into the __stack field
- /// of the va_list type.
- int VariadicStackIdx;
+ /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// floating-point registers.
+ int VarArgsFPRIndex;
- /// The offset of the frame pointer from the stack pointer on function
- /// entry. This is expected to be negative.
- int FramePointerOffset;
+ /// \brief Size of the varargs area for arguments passed in floating-point
+ /// registers.
+ unsigned VarArgsFPRSize;
public:
- AArch64MachineFunctionInfo()
- : BytesInStackArgArea(0),
- ArgumentStackToRestore(0),
- InitialStackAdjust(0),
- NumLocalDynamics(0),
- VariadicGPRIdx(0),
- VariadicGPRSize(0),
- VariadicFPRIdx(0),
- VariadicFPRSize(0),
- VariadicStackIdx(0),
- FramePointerOffset(0) {}
+ AArch64FunctionInfo()
+ : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+ NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
- explicit AArch64MachineFunctionInfo(MachineFunction &MF)
- : BytesInStackArgArea(0),
- ArgumentStackToRestore(0),
- InitialStackAdjust(0),
- NumLocalDynamics(0),
- VariadicGPRIdx(0),
- VariadicGPRSize(0),
- VariadicFPRIdx(0),
- VariadicFPRSize(0),
- VariadicStackIdx(0),
- FramePointerOffset(0) {}
+ explicit AArch64FunctionInfo(MachineFunction &MF)
+ : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+ NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+ (void)MF;
+ }
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
- void setBytesInStackArgArea (unsigned bytes) { BytesInStackArgArea = bytes;}
+ void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
void setArgumentStackToRestore(unsigned bytes) {
ArgumentStackToRestore = bytes;
}
- unsigned getInitialStackAdjust() const { return InitialStackAdjust; }
- void setInitialStackAdjust(unsigned bytes) { InitialStackAdjust = bytes; }
+ bool hasStackFrame() const { return HasStackFrame; }
+ void setHasStackFrame(bool s) { HasStackFrame = s; }
- unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
- void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+ void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
+ unsigned getLocalStackSize() const { return LocalStackSize; }
- int getVariadicGPRIdx() const { return VariadicGPRIdx; }
- void setVariadicGPRIdx(int Idx) { VariadicGPRIdx = Idx; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
+ unsigned getNumLocalDynamicTLSAccesses() const {
+ return NumLocalDynamicTLSAccesses;
+ }
- unsigned getVariadicGPRSize() const { return VariadicGPRSize; }
- void setVariadicGPRSize(unsigned Size) { VariadicGPRSize = Size; }
+ int getVarArgsStackIndex() const { return VarArgsStackIndex; }
+ void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
- int getVariadicFPRIdx() const { return VariadicFPRIdx; }
- void setVariadicFPRIdx(int Idx) { VariadicFPRIdx = Idx; }
+ int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
+ void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
- unsigned getVariadicFPRSize() const { return VariadicFPRSize; }
- void setVariadicFPRSize(unsigned Size) { VariadicFPRSize = Size; }
+ unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
+ void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
- int getVariadicStackIdx() const { return VariadicStackIdx; }
- void setVariadicStackIdx(int Idx) { VariadicStackIdx = Idx; }
+ int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
+ void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
- int getFramePointerOffset() const { return FramePointerOffset; }
- void setFramePointerOffset(int Idx) { FramePointerOffset = Idx; }
+ unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
+ void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
+ typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
+
+ const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
+
+ // Shortcuts for LOH related types.
+ class MILOHDirective {
+ MCLOHType Kind;
+
+ /// Arguments of this directive. Order matters.
+ SmallVector<const MachineInstr *, 3> Args;
+
+ public:
+ typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
+
+ MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
+ : Kind(Kind), Args(Args.begin(), Args.end()) {
+ assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
+ }
+
+ MCLOHType getKind() const { return Kind; }
+ const LOHArgs &getArgs() const { return Args; }
+ };
+
+ typedef MILOHDirective::LOHArgs MILOHArgs;
+ typedef SmallVector<MILOHDirective, 32> MILOHContainer;
+
+ const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
+
+ /// Add a LOH directive of this @p Kind and this @p Args.
+ void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
+ LOHContainerSet.push_back(MILOHDirective(Kind, Args));
+ LOHRelated.insert(Args.begin(), Args.end());
+ }
+
+private:
+ // Hold the lists of LOHs.
+ MILOHContainer LOHContainerSet;
+ SetOfInstructions LOHRelated;
};
-
} // End llvm namespace
-#endif
+#endif // AArch64MACHINEFUNCTIONINFO_H
diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h
new file mode 100644
index 0000000..b22fa24
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using AdvSIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+ 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+ 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+ 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+ 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+ 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+ 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+ 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+ 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+ 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+ 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+ 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+ 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+ 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+ 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+ 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+ 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+ 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+ 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+ 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+ 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+ 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+ 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+ 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+ 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+ 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+ 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+ 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+ 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+ 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+ 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+ 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+ 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+ 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+ 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+ 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+ 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+ 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+ 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+ 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+ 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+ 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+ 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+ 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+ 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+ 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+ 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+ 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+ 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+ 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+ 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+ 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+ 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+ 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+ 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+ 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+ 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+ 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+ 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+ 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+ 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+ 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+ 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+ 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+ 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+ 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+ 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+ 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+ 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+ 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+ 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+ 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+ 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+ 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+ 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+ 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+ 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+ 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+ 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+ 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+ 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+ 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+ 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+ 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+ 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+ 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+ 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+ 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+ 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+ 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+ 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+ 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+ 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+ 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+ 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+ 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+ 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+ 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+ 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+ 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+ 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+ 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+ 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+ 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+ 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+ 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+ 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+ 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+ 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+ 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+ 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+ 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+ 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+ 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+ 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+ 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+ 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+ 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+ 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+ 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+ 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+ 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+ 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+ 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+ 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+ 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+ 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+ 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+ 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+ 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+ 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+ 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+ 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+ 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+ 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+ 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+ 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+ 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+ 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+ 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+ 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+ 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+ 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+ 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+ 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+ 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+ 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+ 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+ 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+ 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+ 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+ 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+ 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+ 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+ 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+ 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+ 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+ 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+ 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+ 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+ 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+ 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+ 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+ 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+ 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+ 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+ 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+ 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+ 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+ 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+ 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+ 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+ 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+ 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+ 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+ 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+ 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+ 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+ 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+ 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+ 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+ 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+ 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+ 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+ 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+ 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+ 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+ 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+ 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+ 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+ 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+ 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+ 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+ 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+ 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+ 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+ 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+ 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+ 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+ 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+ 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+ 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+ 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+ 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+ 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+ 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+ 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+ 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+ 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+ 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+ 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+ 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+ 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+ 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+ 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+ 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+ 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+ 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+ 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+ 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+ 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+ 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+ 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+ 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+ 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+ 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+ 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+ 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+ 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+ 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+ 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+ 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+ 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+ 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+ 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+ 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+ 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+ 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+ 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+ 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+ 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+ 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+ 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+ 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+ 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+ 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+ 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+ 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+ 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+ 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+ 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+ 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+ 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+ 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+ 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+ 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+ 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+ 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+ 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+ 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+ 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+ 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+ 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+ 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+ 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+ 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+ 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+ 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+ 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+ 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+ 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+ 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+ 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+ 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+ 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+ 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+ 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+ 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+ 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+ 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+ 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+ 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+ 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+ 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+ 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+ 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+ 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+ 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+ 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+ 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+ 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+ 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+ 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+ 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+ 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+ 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+ 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+ 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+ 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+ 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+ 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+ 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+ 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+ 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+ 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+ 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+ 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+ 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+ 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+ 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+ 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+ 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+ 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+ 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+ 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+ 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+ 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+ 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+ 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+ 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+ 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+ 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+ 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+ 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+ 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+ 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+ 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+ 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+ 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+ 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+ 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+ 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+ 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+ 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+ 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+ 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+ 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+ 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+ 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+ 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+ 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+ 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+ 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+ 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+ 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+ 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+ 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+ 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+ 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+ 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+ 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+ 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+ 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+ 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+ 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+ 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+ 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+ 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+ 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+ 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+ 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+ 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+ 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+ 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+ 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+ 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+ 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+ 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+ 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+ 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+ 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+ 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+ 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+ 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+ 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+ 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+ 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+ 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+ 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+ 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+ 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+ 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+ 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+ 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+ 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+ 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+ 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+ 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+ 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+ 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+ 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+ 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+ 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+ 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+ 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+ 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+ 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+ 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+ 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+ 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+ 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+ 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+ 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+ 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+ 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+ 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+ 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+ 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+ 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+ 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+ 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+ 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+ 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+ 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+ 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+ 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+ 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+ 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+ 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+ 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+ 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+ 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+ 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+ 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+ 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+ 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+ 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+ 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+ 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+ 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+ 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+ 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+ 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+ 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+ 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+ 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+ 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+ 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+ 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+ 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+ 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+ 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+ 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+ 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+ 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+ 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+ 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+ 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+ 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+ 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+ 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+ 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+ 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+ 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+ 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+ 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+ 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+ 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+ 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+ 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+ 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+ 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+ 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+ 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+ 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+ 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+ 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+ 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+ 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+ 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+ 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+ 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+ 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+ 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+ 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+ 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+ 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+ 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+ 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+ 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+ 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+ 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+ 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+ 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+ 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+ 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+ 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+ 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+ 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+ 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+ 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+ 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+ 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+ 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+ 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+ 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+ 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+ 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+ 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+ 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+ 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+ 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+ 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+ 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+ 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+ 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+ 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+ 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+ 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+ 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+ 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+ 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+ 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+ 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+ 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+ 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+ 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+ 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+ 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+ 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+ 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+ 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+ 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+ 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+ 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+ 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+ 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+ 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+ 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+ 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+ 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+ 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+ 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+ 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+ 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+ 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+ 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+ 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+ 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+ 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+ 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+ 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+ 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+ 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+ 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+ 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+ 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+ 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+ 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+ 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+ 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+ 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+ 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+ 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+ 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+ 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+ 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+ 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+ 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+ 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+ 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+ 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+ 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+ 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+ 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+ 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+ 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+ 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+ 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+ 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+ 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+ 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+ 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+ 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+ 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+ 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+ 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+ 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+ 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+ 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+ 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+ 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+ 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+ 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+ 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+ 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+ 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+ 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+ 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+ 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+ 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+ 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+ 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+ 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+ 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+ 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+ 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+ 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+ 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+ 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+ 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+ 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+ 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+ 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+ 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+ 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+ 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+ 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+ 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+ 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+ 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+ 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+ 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+ 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+ 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+ 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+ 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+ 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+ 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+ 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+ 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+ 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+ 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+ 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+ 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+ 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+ 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+ 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+ 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+ 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+ 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+ 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+ 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+ 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+ 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+ 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+ 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+ 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+ 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+ 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+ 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+ 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+ 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+ 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+ 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+ 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+ 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+ 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+ 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+ 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+ 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+ 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+ 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+ 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+ 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+ 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+ 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+ 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+ 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+ 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+ 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+ 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+ 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+ 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+ 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+ 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+ 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+ 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+ 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+ 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+ 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+ 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+ 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+ 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+ 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+ 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+ 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+ 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+ 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+ 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+ 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+ 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+ 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+ 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+ 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+ 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+ 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+ 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+ 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+ 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+ 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+ 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+ 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+ 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+ 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+ 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+ 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+ 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+ 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+ 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+ 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+ 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+ 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+ 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+ 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+ 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+ 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+ 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+ 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+ 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+ 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+ 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+ 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+ 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+ 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+ 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+ 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+ 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+ 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+ 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+ 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+ 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+ 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+ 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+ 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+ 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+ 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+ 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+ 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+ 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+ 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+ 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+ 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+ 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+ 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+ 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+ 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+ 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+ 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+ 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+ 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+ 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+ 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+ 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+ 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+ 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+ 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+ 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+ 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+ 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+ 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+ 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+ 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+ 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+ 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+ 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+ 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+ 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+ 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+ 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+ 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+ 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+ 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+ 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+ 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+ 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+ 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+ 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+ 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+ 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+ 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+ 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+ 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+ 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+ 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+ 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+ 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+ 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+ 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+ 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+ 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+ 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+ 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+ 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+ 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+ 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+ 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+ 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+ 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+ 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+ 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+ 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+ 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+ 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+ 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+ 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+ 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+ 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+ 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+ 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+ 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+ 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+ 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+ 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+ 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+ 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+ 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+ 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+ 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+ 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+ 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+ 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+ 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+ 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+ 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+ 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+ 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+ 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+ 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+ 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+ 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+ 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+ 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+ 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+ 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+ 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+ 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+ 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+ 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+ 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+ 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+ 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+ 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+ 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+ 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+ 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+ 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+ 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+ 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+ 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+ 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+ 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+ 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+ 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+ 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+ 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+ 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+ 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+ 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+ 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+ 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+ 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+ 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+ 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+ 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+ 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+ 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+ 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+ 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+ 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+ 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+ 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+ 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+ 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+ 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+ 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+ 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+ 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+ 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+ 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+ 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+ 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+ 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+ 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+ 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+ 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+ 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+ 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+ 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+ 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+ 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+ 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+ 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+ 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+ 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+ 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+ 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+ 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+ 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+ 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+ 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+ 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+ 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+ 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+ 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+ 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+ 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+ 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+ 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+ 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+ 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+ 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+ 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+ 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+ 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+ 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+ 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+ 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+ 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+ 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+ 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+ 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+ 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+ 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+ 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+ 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+ 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+ 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+ 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+ 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+ 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+ 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+ 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+ 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+ 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+ 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+ 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+ 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+ 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+ 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+ 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+ 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+ 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+ 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+ 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+ 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+ 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+ 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+ 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+ 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+ 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+ 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+ 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+ 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+ 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+ 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+ 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+ 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+ 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+ 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+ 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+ 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+ 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+ 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+ 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+ 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+ 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+ 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+ 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+ 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+ 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+ 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+ 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+ 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+ 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+ 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+ 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+ 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+ 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+ 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+ 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+ 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+ 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+ 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+ 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+ 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+ 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+ 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+ 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+ 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+ 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+ 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+ 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+ 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+ 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+ 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+ 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+ 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+ 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+ 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+ 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+ 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+ 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+ 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+ 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+ 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+ 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+ 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+ 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+ 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+ 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+ 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+ 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+ 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+ 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+ 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+ 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+ 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+ 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+ 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+ 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+ 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+ 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+ 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+ 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+ 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+ 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+ 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+ 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+ 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+ 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+ 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+ 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+ 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+ 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+ 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+ 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+ 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+ 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+ 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+ 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+ 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+ 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+ 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+ 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+ 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+ 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+ 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+ 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+ 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+ 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+ 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+ 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+ 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+ 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+ 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+ 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+ 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+ 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+ 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+ 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+ 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+ 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+ 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+ 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+ 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+ 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+ 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+ 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+ 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+ 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+ 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+ 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+ 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+ 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+ 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+ 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+ 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+ 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+ 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+ 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+ 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+ 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+ 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+ 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+ 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+ 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+ 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+ 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+ 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+ 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+ 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+ 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+ 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+ 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+ 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+ 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+ 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+ 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+ 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+ 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+ 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+ 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+ 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+ 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+ 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+ 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+ 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+ 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+ 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+ 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+ 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+ 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+ 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+ 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+ 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+ 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+ 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+ 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+ 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+ 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+ 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+ 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+ 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+ 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+ 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+ 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+ 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+ 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+ 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+ 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+ 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+ 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+ 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+ 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+ 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+ 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+ 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+ 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+ 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+ 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+ 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+ 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+ 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+ 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+ 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+ 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+ 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+ 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+ 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+ 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+ 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+ 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+ 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+ 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+ 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+ 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+ 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+ 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+ 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+ 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+ 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+ 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+ 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+ 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+ 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+ 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+ 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+ 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+ 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+ 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+ 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+ 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+ 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+ 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+ 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+ 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+ 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+ 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+ 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+ 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+ 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+ 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+ 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+ 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+ 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+ 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+ 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+ 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+ 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+ 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+ 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+ 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+ 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+ 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+ 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+ 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+ 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+ 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+ 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+ 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+ 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+ 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+ 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+ 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+ 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+ 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+ 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+ 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+ 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+ 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+ 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+ 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+ 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+ 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+ 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+ 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+ 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+ 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+ 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+ 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+ 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+ 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+ 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+ 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+ 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+ 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+ 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+ 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+ 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+ 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+ 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+ 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+ 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+ 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+ 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+ 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+ 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+ 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+ 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+ 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+ 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+ 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+ 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+ 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+ 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+ 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+ 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+ 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+ 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+ 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+ 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+ 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+ 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+ 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+ 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+ 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+ 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+ 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+ 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+ 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+ 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+ 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+ 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+ 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+ 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+ 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+ 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+ 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+ 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+ 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+ 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+ 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+ 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+ 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+ 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+ 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+ 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+ 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+ 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+ 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+ 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+ 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+ 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+ 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+ 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+ 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+ 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+ 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+ 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+ 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+ 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+ 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+ 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+ 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+ 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+ 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+ 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+ 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+ 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+ 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+ 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+ 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+ 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+ 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+ 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+ 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+ 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+ 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+ 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+ 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+ 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+ 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+ 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+ 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+ 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+ 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+ 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+ 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+ 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+ 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+ 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+ 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+ 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+ 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+ 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+ 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+ 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+ 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+ 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+ 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+ 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+ 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+ 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+ 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+ 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+ 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+ 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+ 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+ 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+ 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+ 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+ 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+ 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+ 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+ 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+ 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+ 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+ 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+ 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+ 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+ 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+ 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+ 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+ 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+ 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+ 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+ 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+ 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+ 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+ 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+ 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+ 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+ 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+ 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+ 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+ 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+ 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+ 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+ 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+ 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+ 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+ 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+ 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+ 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+ 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+ 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+ 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+ 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+ 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+ 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+ 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+ 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+ 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+ 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+ 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+ 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+ 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+ 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+ 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+ 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+ 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+ 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+ 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+ 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+ 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+ 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+ 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+ 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+ 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+ 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+ 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+ 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+ 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+ 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+ 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+ 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+ 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+ 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+ 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+ 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+ 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+ 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+ 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+ 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+ 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+ 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+ 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+ 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+ 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+ 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+ 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+ 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+ 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+ 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+ 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+ 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+ 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+ 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+ 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+ 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+ 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+ 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+ 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+ 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+ 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+ 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+ 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+ 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+ 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+ 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+ 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+ 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+ 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+ 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+ 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+ 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+ 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+ 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+ 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+ 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+ 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+ 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+ 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+ 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+ 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+ 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+ 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+ 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+ 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+ 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+ 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+ 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+ 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+ 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+ 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+ 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+ 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+ 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+ 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+ 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+ 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+ 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+ 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+ 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+ 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+ 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+ 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+ 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+ 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+ 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+ 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+ 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+ 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+ 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+ 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+ 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+ 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+ 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+ 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+ 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+ 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+ 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+ 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+ 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+ 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+ 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+ 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+ 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+ 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+ 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+ 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+ 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+ 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+ 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+ 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+ 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+ 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+ 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+ 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+ 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+ 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+ 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+ 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+ 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+ 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+ 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+ 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+ 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
new file mode 100644
index 0000000..4723cc4
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -0,0 +1,578 @@
+//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64PromoteConstant pass which promotes constants
+// to global variables when this is likely to be more efficient. Currently only
+// types related to constant vector (i.e., constant vector, array of constant
+// vectors, constant structure with a constant vector field, etc.) are promoted
+// to global variables. Constant vectors are likely to be lowered in target
+// constant pool during instruction selection already; therefore, the access
+// will remain the same (memory load), but the structure types are not split
+// into different constant pool accesses for each field. A bonus side effect is
+// that created globals may be merged by the global merge pass.
+//
+// FIXME: This pass may be useful for other targets too.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+ cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+// AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constant into global variables.
+/// The motivating example is:
+/// static const uint16_t TableA[32] = {
+/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+/// };
+///
+/// uint8x16x4_t LoadStatic(void) {
+/// uint8x16x4_t ret;
+/// ret.val[0] = vld1q_u16(TableA + 0);
+/// ret.val[1] = vld1q_u16(TableA + 8);
+/// ret.val[2] = vld1q_u16(TableA + 16);
+/// ret.val[3] = vld1q_u16(TableA + 24);
+/// return ret;
+/// }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is vector the cheapest way to create them is to load them
+/// for the memory.
+///
+/// Therefore the final assembly final has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+ static char ID;
+ AArch64PromoteConstant() : ModulePass(ID) {}
+
+ const char *getPassName() const override { return "AArch64 Promote Constant"; }
+
+ /// Iterate over the functions and promote the interesting constants into
+ /// global variables with module scope.
+ bool runOnModule(Module &M) override {
+ DEBUG(dbgs() << getPassName() << '\n');
+ bool Changed = false;
+ for (auto &MF : M) {
+ Changed |= runOnFunction(MF);
+ }
+ return Changed;
+ }
+
+private:
+ /// Look for interesting constants used within the given function.
+ /// Promote them into global variables, load these global variables within
+ /// the related function, so that the number of inserted load is minimal.
+ bool runOnFunction(Function &F);
+
+ // This transformation requires dominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ /// Type to store a list of User.
+ typedef SmallVector<Value::user_iterator, 4> Users;
+ /// Map an insertion point to all the uses it dominates.
+ typedef DenseMap<Instruction *, Users> InsertionPoints;
+ /// Map a function to the required insertion point of load for a
+ /// global variable.
+ typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
+
+ /// Find the closest point that dominates the given Use.
+ Instruction *findInsertionPoint(Value::user_iterator &Use);
+
+ /// Check if the given insertion point is dominated by an existing
+ /// insertion point.
+ /// If true, the given use is added to the list of dominated uses for
+ /// the related existing point.
+ /// \param NewPt the insertion point to be checked
+ /// \param UseIt the use to be added into the list of dominated uses
+ /// \param InsertPts existing insertion points
+ /// \pre NewPt and all instruction in InsertPts belong to the same function
+ /// \return true if one of the insertion point in InsertPts dominates NewPt,
+ /// false otherwise
+ bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts);
+
+ /// Check if the given insertion point can be merged with an existing
+ /// insertion point in a common dominator.
+ /// If true, the given use is added to the list of the created insertion
+ /// point.
+ /// \param NewPt the insertion point to be checked
+ /// \param UseIt the use to be added into the list of dominated uses
+ /// \param InsertPts existing insertion points
+ /// \pre NewPt and all instruction in InsertPts belong to the same function
+ /// \pre isDominated returns false for the exact same parameters.
+ /// \return true if it exists an insertion point in InsertPts that could
+ /// have been merged with NewPt in a common dominator,
+ /// false otherwise
+ bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts);
+
+ /// Compute the minimal insertion points to dominates all the interesting
+ /// uses of value.
+ /// Insertion points are group per function and each insertion point
+ /// contains a list of all the uses it dominates within the related function
+ /// \param Val constant to be examined
+ /// \param[out] InsPtsPerFunc output storage of the analysis
+ void computeInsertionPoints(Constant *Val,
+ InsertionPointsPerFunc &InsPtsPerFunc);
+
+ /// Insert a definition of a new global variable at each point contained in
+ /// InsPtsPerFunc and update the related uses (also contained in
+ /// InsPtsPerFunc).
+ bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
+
+ /// Compute the minimal insertion points to dominate all the interesting
+ /// uses of Val and insert a definition of a new global variable
+ /// at these points.
+ /// Also update the uses of Val accordingly.
+ /// Currently a use of Val is considered interesting if:
+ /// - Val is not UndefValue
+ /// - Val is not zeroinitialized
+ /// - Replacing Val per a load of a global variable is valid.
+ /// \see shouldConvert for more details
+ bool computeAndInsertDefinitions(Constant *Val);
+
+ /// Promote the given constant into a global variable if it is expected to
+ /// be profitable.
+ /// \return true if Cst has been promoted
+ bool promoteConstant(Constant *Cst);
+
+ /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+ /// Append UseIt to this list and delete the entry of IPI in InsertPts.
+ static void appendAndTransferDominatedUses(Instruction *NewPt,
+ Value::user_iterator &UseIt,
+ InsertionPoints::iterator &IPI,
+ InsertionPoints &InsertPts) {
+ // Record the dominated use.
+ IPI->second.push_back(UseIt);
+ // Transfer the dominated uses of IPI to NewPt
+ // Inserting into the DenseMap may invalidate existing iterator.
+ // Keep a copy of the key to find the iterator to erase.
+ Instruction *OldInstr = IPI->first;
+ InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+ // Erase IPI.
+ IPI = InsertPts.find(OldInstr);
+ InsertPts.erase(IPI);
+ }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+namespace llvm {
+void initializeAArch64PromoteConstantPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+ "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+ "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+ return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+ if (CstTy->isVectorTy())
+ return true;
+ if (CstTy->isStructTy()) {
+ for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+ EltIdx < EndEltIdx; ++EltIdx)
+ if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+ return true;
+ } else if (CstTy->isArrayTy())
+ return isConstantUsingVectorTy(CstTy->getArrayElementType());
+ return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+ unsigned OpIdx) {
+ // shufflevector instruction expects a const for the mask argument, i.e., the
+ // third argument. Do not promote this use in that case.
+ if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
+ return false;
+
+ // extractvalue instruction expects a const idx.
+ if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // extractvalue instruction expects a const idx.
+ if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
+ return false;
+
+ if (isa<const AllocaInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Alignment argument must be constant.
+ if (isa<const LoadInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Alignment argument must be constant.
+ if (isa<const StoreInst>(Instr) && OpIdx > 1)
+ return false;
+
+ // Index must be constant.
+ if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Personality function and filters must be constant.
+ // Give up on that instruction.
+ if (isa<const LandingPadInst>(Instr))
+ return false;
+
+ // Switch instruction expects constants to compare to.
+ if (isa<const SwitchInst>(Instr))
+ return false;
+
+ // Expected address must be a constant.
+ if (isa<const IndirectBrInst>(Instr))
+ return false;
+
+ // Do not mess with intrinsics.
+ if (isa<const IntrinsicInst>(Instr))
+ return false;
+
+ // Do not mess with inline asm.
+ const CallInst *CI = dyn_cast<const CallInst>(Instr);
+ if (CI && isa<const InlineAsm>(CI->getCalledValue()))
+ return false;
+
+ return true;
+}
+
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector related types.
+/// Also we give up on all simple vector type to keep the existing
+/// behavior. Otherwise, we should push here all the check of the lowering of
+/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
+/// constant via global merge and the fact that the same constant is stored
+/// only once with this method (versus, as many function that uses the constant
+/// for the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvert(const Constant *Cst) {
+ if (isa<const UndefValue>(Cst))
+ return false;
+
+ // FIXME: In some cases, it may be interesting to promote in memory
+ // a zero initialized constant.
+ // E.g., when the type of Cst require more instructions than the
+ // adrp/add/load sequence or when this sequence can be shared by several
+ // instances of Cst.
+ // Ideally, we could promote this into a global and rematerialize the constant
+ // when it was a bad idea.
+ if (Cst->isZeroValue())
+ return false;
+
+ if (Stress)
+ return true;
+
+ // FIXME: see function \todo
+ if (Cst->getType()->isVectorTy())
+ return false;
+ return isConstantUsingVectorTy(Cst->getType());
+}
+
+Instruction *
+AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+ // If this user is a phi, the insertion point is in the related
+ // incoming basic block.
+ PHINode *PhiInst = dyn_cast<PHINode>(*Use);
+ Instruction *InsertionPoint;
+ if (PhiInst)
+ InsertionPoint =
+ PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+ else
+ InsertionPoint = dyn_cast<Instruction>(*Use);
+ assert(InsertionPoint && "User is not an instruction!");
+ return InsertionPoint;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
+ Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts) {
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *NewPt->getParent()->getParent()).getDomTree();
+
+ // Traverse all the existing insertion points and check if one is dominating
+ // NewPt. If it is, remember that.
+ for (auto &IPI : InsertPts) {
+ if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+ // When IPI.first is a terminator instruction, DT may think that
+ // the result is defined on the edge.
+ // Here we are testing the insertion point, not the definition.
+ (IPI.first->getParent() != NewPt->getParent() &&
+ DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+ // No need to insert this point. Just record the dominated use.
+ DEBUG(dbgs() << "Insertion point dominated by:\n");
+ DEBUG(IPI.first->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ IPI.second.push_back(UseIt);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
+ Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts) {
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *NewPt->getParent()->getParent()).getDomTree();
+ BasicBlock *NewBB = NewPt->getParent();
+
+ // Traverse all the existing insertion point and check if one is dominated by
+ // NewPt and thus useless or can be combined with NewPt into a common
+ // dominator.
+ for (InsertionPoints::iterator IPI = InsertPts.begin(),
+ EndIPI = InsertPts.end();
+ IPI != EndIPI; ++IPI) {
+ BasicBlock *CurBB = IPI->first->getParent();
+ if (NewBB == CurBB) {
+ // Instructions are in the same block.
+ // By construction, NewPt is dominating the other.
+ // Indeed, isDominated returned false with the exact same arguments.
+ DEBUG(dbgs() << "Merge insertion point with:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << "\nat considered insertion point.\n");
+ appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+ return true;
+ }
+
+ // Look for a common dominator
+ BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+ // If none exists, we cannot merge these two points.
+ if (!CommonDominator)
+ continue;
+
+ if (CommonDominator != NewBB) {
+ // By construction, the CommonDominator cannot be CurBB.
+ assert(CommonDominator != CurBB &&
+ "Instruction has not been rejected during isDominated check!");
+ // Take the last instruction of the CommonDominator as insertion point
+ NewPt = CommonDominator->getTerminator();
+ }
+ // else, CommonDominator is the block of NewBB, hence NewBB is the last
+ // possible insertion point in that block.
+ DEBUG(dbgs() << "Merge insertion point with:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ DEBUG(NewPt->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+ return true;
+ }
+ return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoints(
+ Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
+ DEBUG(dbgs() << "** Compute insertion points **\n");
+ for (Value::user_iterator UseIt = Val->user_begin(),
+ EndUseIt = Val->user_end();
+ UseIt != EndUseIt; ++UseIt) {
+ // If the user is not an Instruction, we cannot modify it.
+ if (!isa<Instruction>(*UseIt))
+ continue;
+
+ // Filter out uses that should not be converted.
+ if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+ continue;
+
+ DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
+ DEBUG((*UseIt)->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ Instruction *InsertionPoint = findInsertionPoint(UseIt);
+
+ DEBUG(dbgs() << "Considered insertion point:\n");
+ DEBUG(InsertionPoint->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ // Check if the current insertion point is useless, i.e., it is dominated
+ // by another one.
+ InsertionPoints &InsertPts =
+ InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
+ if (isDominated(InsertionPoint, UseIt, InsertPts))
+ continue;
+ // This insertion point is useful, check if we can merge some insertion
+ // point in a common dominator or if NewPt dominates an existing one.
+ if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+ continue;
+
+ DEBUG(dbgs() << "Keep considered insertion point\n");
+
+ // It is definitely useful by its own
+ InsertPts[InsertionPoint].push_back(UseIt);
+ }
+}
+
+bool AArch64PromoteConstant::insertDefinitions(
+ Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
+ // We will create one global variable per Module.
+ DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
+ bool HasChanged = false;
+
+ // Traverse all insertion points in all the function.
+ for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
+ EndIt = InsPtsPerFunc.end();
+ FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
+ InsertionPoints &InsertPts = FctToInstPtsIt->second;
+// Do more checking for debug purposes.
+#ifndef NDEBUG
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *FctToInstPtsIt->first).getDomTree();
+#endif
+ GlobalVariable *PromotedGV;
+ assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+ Module *M = FctToInstPtsIt->first->getParent();
+ DenseMap<Module *, GlobalVariable *>::iterator MapIt =
+ ModuleToMergedGV.find(M);
+ if (MapIt == ModuleToMergedGV.end()) {
+ PromotedGV = new GlobalVariable(
+ *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
+ "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+ PromotedGV->setInitializer(Cst);
+ ModuleToMergedGV[M] = PromotedGV;
+ DEBUG(dbgs() << "Global replacement: ");
+ DEBUG(PromotedGV->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ ++NumPromoted;
+ HasChanged = true;
+ } else {
+ PromotedGV = MapIt->second;
+ }
+
+ for (InsertionPoints::iterator IPI = InsertPts.begin(),
+ EndIPI = InsertPts.end();
+ IPI != EndIPI; ++IPI) {
+ // Create the load of the global variable.
+ IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+ LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
+ DEBUG(dbgs() << "**********\n");
+ DEBUG(dbgs() << "New def: ");
+ DEBUG(LoadedCst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ // Update the dominated uses.
+ Users &DominatedUsers = IPI->second;
+ for (Value::user_iterator Use : DominatedUsers) {
+#ifndef NDEBUG
+ assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
+ (isa<PHINode>(*Use) &&
+ DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
+ "Inserted definition does not dominate all its uses!");
+#endif
+ DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
+ DEBUG(Use->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ Use->setOperand(Use.getOperandNo(), LoadedCst);
+ ++NumPromotedUses;
+ }
+ }
+ }
+ return HasChanged;
+}
+
+bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+ InsertionPointsPerFunc InsertPtsPerFunc;
+ computeInsertionPoints(Val, InsertPtsPerFunc);
+ return insertDefinitions(Val, InsertPtsPerFunc);
+}
+
+bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
+ assert(Cst && "Given variable is not a valid constant.");
+
+ if (!shouldConvert(Cst))
+ return false;
+
+ DEBUG(dbgs() << "******************************\n");
+ DEBUG(dbgs() << "Candidate constant: ");
+ DEBUG(Cst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ return computeAndInsertDefinitions(Cst);
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F) {
+ // Look for instructions using constant vector. Promote that constant to a
+ // global variable. Create as few loads of this variable as possible and
+ // update the uses accordingly.
+ bool LocalChange = false;
+ SmallSet<Constant *, 8> AlreadyChecked;
+
+ for (auto &MBB : F) {
+ for (auto &MI : MBB) {
+ // Traverse the operand, looking for constant vectors. Replace them by a
+ // load of a global variable of constant vector type.
+ for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
+ OpIdx != EndOpIdx; ++OpIdx) {
+ Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
+ // There is no point in promoting global values as they are already
+ // global. Do not promote constant expressions either, as they may
+ // require some code expansion.
+ if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+ AlreadyChecked.insert(Cst))
+ LocalChange |= promoteConstant(Cst);
+ }
+ }
+ }
+ return LocalChange;
+}
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 06e1ffb..01b9587 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -12,175 +12,393 @@
//
//===----------------------------------------------------------------------===//
-
#include "AArch64RegisterInfo.h"
#include "AArch64FrameLowering.h"
-#include "AArch64MachineFunctionInfo.h"
-#include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
-using namespace llvm;
+AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
+ const AArch64Subtarget *sti)
+ : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {}
-AArch64RegisterInfo::AArch64RegisterInfo()
- : AArch64GenRegisterInfo(AArch64::X30) {
-}
-
-const uint16_t *
+const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- return CSR_PCS_SaveList;
+ assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
+ return CSR_AArch64_AllRegs_SaveList;
+ else
+ return CSR_AArch64_AAPCS_SaveList;
}
-const uint32_t*
-AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID) const {
- return CSR_PCS_RegMask;
+const uint32_t *
+AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ if (CC == CallingConv::AnyReg)
+ return CSR_AArch64_AllRegs_RegMask;
+ else
+ return CSR_AArch64_AAPCS_RegMask;
}
-const uint32_t *AArch64RegisterInfo::getTLSDescCallPreservedMask() const {
- return TLSDesc_RegMask;
+const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
+ if (STI->isTargetDarwin())
+ return CSR_AArch64_TLS_Darwin_RegMask;
+
+ assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
+ return CSR_AArch64_TLS_ELF_RegMask;
}
-const TargetRegisterClass *
-AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
- if (RC == &AArch64::FlagClassRegClass)
- return &AArch64::GPR64RegClass;
-
- return RC;
+const uint32_t *
+AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+ // This should return a register mask that is the same as that returned by
+ // getCallPreservedMask but that additionally preserves the register used for
+ // the first i64 argument (which must also be the register used to return a
+ // single i64 return value)
+ //
+ // In case that the calling convention does not use the same register for
+ // both, the function should return NULL (does not currently apply)
+ return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
-
-
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
- BitVector Reserved(getNumRegs());
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
- Reserved.set(AArch64::XSP);
- Reserved.set(AArch64::WSP);
-
+ // FIXME: avoid re-calculating this every time.
+ BitVector Reserved(getNumRegs());
+ Reserved.set(AArch64::SP);
Reserved.set(AArch64::XZR);
+ Reserved.set(AArch64::WSP);
Reserved.set(AArch64::WZR);
- if (TFI->hasFP(MF)) {
- Reserved.set(AArch64::X29);
+ if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
+ Reserved.set(AArch64::FP);
Reserved.set(AArch64::W29);
}
+ if (STI->isTargetDarwin()) {
+ Reserved.set(AArch64::X18); // Platform register
+ Reserved.set(AArch64::W18);
+ }
+
+ if (hasBasePointer(MF)) {
+ Reserved.set(AArch64::X19);
+ Reserved.set(AArch64::W19);
+ }
+
return Reserved;
}
-static bool hasFrameOffset(int opcode) {
- return opcode != AArch64::LD1x2_8B && opcode != AArch64::LD1x3_8B &&
- opcode != AArch64::LD1x4_8B && opcode != AArch64::ST1x2_8B &&
- opcode != AArch64::ST1x3_8B && opcode != AArch64::ST1x4_8B &&
- opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
- opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B &&
- opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B;
+bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
+ unsigned Reg) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ switch (Reg) {
+ default:
+ break;
+ case AArch64::SP:
+ case AArch64::XZR:
+ case AArch64::WSP:
+ case AArch64::WZR:
+ return true;
+ case AArch64::X18:
+ case AArch64::W18:
+ return STI->isTargetDarwin();
+ case AArch64::FP:
+ case AArch64::W29:
+ return TFI->hasFP(MF) || STI->isTargetDarwin();
+ case AArch64::W19:
+ case AArch64::X19:
+ return hasBasePointer(MF);
+ }
+
+ return false;
}
-void
-AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
- int SPAdj,
- unsigned FIOperandNum,
- RegScavenger *RS) const {
- assert(SPAdj == 0 && "Cannot deal with nonzero SPAdj yet");
- MachineInstr &MI = *MBBI;
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const AArch64FrameLowering *TFI =
- static_cast<const AArch64FrameLowering *>(MF.getTarget().getFrameLowering());
+const TargetRegisterClass *
+AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ return &AArch64::GPR64RegClass;
+}
- // In order to work out the base and offset for addressing, the FrameLowering
- // code needs to know (sometimes) whether the instruction is storing/loading a
- // callee-saved register, or whether it's a more generic
- // operation. Fortunately the frame indices are used *only* for that purpose
- // and are contiguous, so we can check here.
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- int MinCSFI = 0;
- int MaxCSFI = -1;
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &AArch64::CCRRegClass)
+ return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
+ return RC;
+}
- if (CSI.size()) {
- MinCSFI = CSI[0].getFrameIdx();
- MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
+
+bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // In the presence of variable sized objects, if the fixed stack size is
+ // large enough that referencing from the FP won't result in things being
+ // in range relatively often, we can use a base pointer to allow access
+ // from the other direction like the SP normally works.
+ if (MFI->hasVarSizedObjects()) {
+ // Conservatively estimate whether the negative offset from the frame
+ // pointer will be sufficient to reach. If a function has a smallish
+ // frame, it's less likely to have lots of spills and callee saved
+ // space, so it's all more likely to be within range of the frame pointer.
+ // If it's wrong, we'll materialize the constant and still get to the
+ // object; it's just suboptimal. Negative offsets use the unscaled
+ // load/store instructions, which have a 9-bit signed immediate.
+ if (MFI->getLocalFrameSize() < 256)
+ return false;
+ return true;
}
- int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- bool IsCalleeSaveOp = FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI;
-
- unsigned FrameReg;
- int64_t Offset;
- Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj,
- IsCalleeSaveOp);
- // A vector load/store instruction doesn't have an offset operand.
- bool HasOffsetOp = hasFrameOffset(MI.getOpcode());
- if (HasOffsetOp)
- Offset += MI.getOperand(FIOperandNum + 1).getImm();
-
- // DBG_VALUE instructions have no real restrictions so they can be handled
- // easily.
- if (MI.isDebugValue()) {
- MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/ false);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
- return;
- }
-
- const AArch64InstrInfo &TII =
- *static_cast<const AArch64InstrInfo*>(MF.getTarget().getInstrInfo());
- int MinOffset, MaxOffset, OffsetScale;
- if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s || !HasOffsetOp) {
- MinOffset = 0;
- MaxOffset = 0xfff;
- OffsetScale = 1;
- } else {
- // Load/store of a stack object
- TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset);
- }
-
- // There are two situations we don't use frame + offset directly in the
- // instruction:
- // (1) The offset can't really be scaled
- // (2) Can't encode offset as it doesn't have an offset operand
- if ((Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) ||
- (!HasOffsetOp && Offset != 0)) {
- unsigned BaseReg =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
- emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII,
- BaseReg, FrameReg, BaseReg, Offset);
- FrameReg = BaseReg;
- Offset = 0;
- }
-
- // Negative offsets are expected if we address from FP, but for
- // now this checks nothing has gone horribly wrong.
- assert(Offset >= 0 && "Unexpected negative offset from SP");
-
- MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true);
- if (HasOffsetOp)
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
+ return false;
}
unsigned
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
- if (TFI->hasFP(MF))
- return AArch64::X29;
- else
- return AArch64::XSP;
+ return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
+}
+
+bool AArch64RegisterInfo::requiresRegisterScavenging(
+ const MachineFunction &MF) const {
+ return true;
+}
+
+bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
+ const MachineFunction &MF) const {
+ return true;
}
bool
AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
- const AArch64FrameLowering *AFI
- = static_cast<const AArch64FrameLowering*>(TFI);
- return AFI->useFPForAddressing(MF);
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
+ // to the stack pointer, so only put the emergency spill slot next to the
+ // FP when there's no better way to access it (SP or base pointer).
+ return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
}
+
+bool AArch64RegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ return true;
+}
+
+bool
+AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Only consider eliminating leaf frames.
+ if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
+ MFI->adjustsStack()))
+ return true;
+ return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+ int64_t Offset) const {
+ for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
+ assert(i < MI->getNumOperands() &&
+ "Instr doesn't have FrameIndex operand!");
+
+ // It's the load/store FI references that cause issues, as it can be difficult
+ // to materialize the offset if it won't fit in the literal field. Estimate
+ // based on the size of the local frame and some conservative assumptions
+ // about the rest of the stack frame (note, this is pre-regalloc, so
+ // we don't know everything for certain yet) whether this offset is likely
+ // to be out of range of the immediate. Return true if so.
+
+ // We only generate virtual base registers for loads and stores, so
+ // return false for everything else.
+ if (!MI->mayLoad() && !MI->mayStore())
+ return false;
+
+ // Without a virtual base register, if the function has variable sized
+ // objects, all fixed-size local references will be via the frame pointer,
+ // Approximate the offset and see if it's legal for the instruction.
+ // Note that the incoming offset is based on the SP value at function entry,
+ // so it'll be negative.
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Estimate an offset from the frame pointer.
+ // Conservatively assume all GPR callee-saved registers get pushed.
+ // FP, LR, X19-X28, D8-D15. 64-bits each.
+ int64_t FPOffset = Offset - 16 * 20;
+ // Estimate an offset from the stack pointer.
+ // The incoming offset is relating to the SP at the start of the function,
+ // but when we access the local it'll be relative to the SP after local
+ // allocation, so adjust our SP-relative offset by that allocation size.
+ Offset += MFI->getLocalFrameSize();
+ // Assume that we'll have at least some spill slots allocated.
+ // FIXME: This is a total SWAG number. We should run some statistics
+ // and pick a real one.
+ Offset += 128; // 128 bytes of spill slots
+
+ // If there is a frame pointer, try using it.
+ // The FP is only available if there is no dynamic realignment. We
+ // don't know for sure yet whether we'll need that, so we guess based
+ // on whether there are any local variables that would trigger it.
+ if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
+ return false;
+
+ // If we can reference via the stack pointer or base pointer, try that.
+ // FIXME: This (and the code that resolves the references) can be improved
+ // to only disallow SP relative references in the live range of
+ // the VLA(s). In practice, it's unclear how much difference that
+ // would make, but it may be worth doing.
+ if (isFrameOffsetLegal(MI, Offset))
+ return false;
+
+ // The offset likely isn't legal; we want to allocate a virtual base register.
+ return true;
+}
+
+bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+ int64_t Offset) const {
+ assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+ assert(MI && "Unable to get the legal offset for nil instruction.");
+ int SaveOffset = Offset;
+ return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
+}
+
+/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
+/// at the beginning of the basic block.
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
+ MachineBasicBlock::iterator Ins = MBB->begin();
+ DebugLoc DL; // Defaults to "unknown"
+ if (Ins != MBB->end())
+ DL = Ins->getDebugLoc();
+
+ const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const MachineFunction &MF = *MBB->getParent();
+ MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
+ unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+
+ BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+ .addFrameIndex(FrameIdx)
+ .addImm(Offset)
+ .addImm(Shifter);
+}
+
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
+ int Off = Offset; // ARM doesn't need the general 64-bit offsets
+ unsigned i = 0;
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+ bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
+ assert(Done && "Unable to resolve frame index!");
+ (void)Done;
+}
+
+void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
+ MF.getTarget().getFrameLowering());
+
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ unsigned FrameReg;
+ int Offset;
+
+ // Special handling of dbg_value, stackmap and patchpoint instructions.
+ if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+ /*PreferFP=*/true);
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // Modify MI as necessary to handle as much of 'Offset' as possible
+ Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+ if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+ return;
+
+ assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+ "Emergency spill slot is out of reach");
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above. Handle the rest, providing a register that is
+ // SP+LargeImm.
+ unsigned ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
+ MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+}
+
+namespace llvm {
+
+unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case AArch64::GPR32RegClassID:
+ case AArch64::GPR32spRegClassID:
+ case AArch64::GPR32allRegClassID:
+ case AArch64::GPR64spRegClassID:
+ case AArch64::GPR64allRegClassID:
+ case AArch64::GPR64RegClassID:
+ case AArch64::GPR32commonRegClassID:
+ case AArch64::GPR64commonRegClassID:
+ return 32 - 1 // XZR/SP
+ - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
+ - STI->isTargetDarwin() // X18 reserved as platform register
+ - hasBasePointer(MF); // X19
+ case AArch64::FPR8RegClassID:
+ case AArch64::FPR16RegClassID:
+ case AArch64::FPR32RegClassID:
+ case AArch64::FPR64RegClassID:
+ case AArch64::FPR128RegClassID:
+ return 32;
+
+ case AArch64::DDRegClassID:
+ case AArch64::DDDRegClassID:
+ case AArch64::DDDDRegClassID:
+ case AArch64::QQRegClassID:
+ case AArch64::QQQRegClassID:
+ case AArch64::QQQQRegClassID:
+ return 32;
+
+ case AArch64::FPR128_loRegClassID:
+ return 16;
+ }
+}
+
+} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 4d67943..76af1ed 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -1,4 +1,4 @@
-//==- AArch64RegisterInfo.h - AArch64 Register Information Impl -*- C++ -*-===//
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -7,14 +7,12 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains the AArch64 implementation of the MCRegisterInfo class.
+// This file contains the AArch64 implementation of the MRegisterInfo class.
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AARCH64REGISTERINFO_H
-#define LLVM_TARGET_AARCH64REGISTERINFO_H
-
-#include "llvm/Target/TargetRegisterInfo.h"
+#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
+#define LLVM_TARGET_AArch64REGISTERINFO_H
#define GET_REGINFO_HEADER
#include "AArch64GenRegisterInfo.inc"
@@ -23,49 +21,81 @@
class AArch64InstrInfo;
class AArch64Subtarget;
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
- AArch64RegisterInfo();
+private:
+ const AArch64InstrInfo *TII;
+ const AArch64Subtarget *STI;
- const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
- const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+public:
+ AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti);
- const uint32_t *getTLSDescCallPreservedMask() const;
+ bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
- BitVector getReservedRegs(const MachineFunction &MF) const;
- unsigned getFrameRegister(const MachineFunction &MF) const;
+ /// Code Generation virtual methods...
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+ const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+ unsigned getCSRFirstUseCost() const override {
+ // The cost will be compared against BlockFrequency where entry has the
+ // value of 1 << 14. A value of 5 will choose to spill or split really
+ // cold path instead of using a callee-saved register.
+ return 5;
+ }
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getTLSCallPreservedMask() const;
+
+ /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+ /// case that 'returned' is on an i64 first argument if the calling convention
+ /// is one that can (partially) model this attribute with a preserved mask
+ /// (i.e. it is a calling convention that uses the same register for the first
+ /// i64 argument and an i64 return value)
+ ///
+ /// Should return NULL in the case that the calling convention does not have
+ /// this property
+ const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+ bool isFrameOffsetLegal(const MachineInstr *MI,
+ int64_t Offset) const override;
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const override;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum,
- RegScavenger *Rs = NULL) const;
+ RegScavenger *RS = nullptr) const override;
+ bool cannotEliminateFrame(const MachineFunction &MF) const;
- /// getCrossCopyRegClass - Returns a legal register class to copy a register
- /// in the specified class to or from. Returns original class if it is
- /// possible to copy between a two registers of the specified class.
- const TargetRegisterClass *
- getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+ bool hasBasePointer(const MachineFunction &MF) const;
+ unsigned getBaseRegister() const;
- /// getLargestLegalSuperClass - Returns the largest super class of RC that is
- /// legal to use in the current sub-target and has the same spill size.
- const TargetRegisterClass*
- getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
- if (RC == &AArch64::tcGPR64RegClass)
- return &AArch64::GPR64RegClass;
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
- return RC;
- }
-
- bool requiresRegisterScavenging(const MachineFunction &MF) const {
- return true;
- }
-
- bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return true;
- }
-
- bool useFPForScavengingIndex(const MachineFunction &MF) const;
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
};
} // end namespace llvm
-#endif // LLVM_TARGET_AARCH64REGISTERINFO_H
+#endif // LLVM_TARGET_AArch64REGISTERINFO_H
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 9de7abd..21c927f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,4 +1,4 @@
-//===- AArch64RegisterInfo.td - ARM Register defs ----------*- tablegen -*-===//
+//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,284 +7,587 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains declarations that describe the AArch64 register file
//
//===----------------------------------------------------------------------===//
-let Namespace = "AArch64" in {
-def sub_128 : SubRegIndex<128>;
-def sub_64 : SubRegIndex<64>;
-def sub_32 : SubRegIndex<32>;
-def sub_16 : SubRegIndex<16>;
-def sub_8 : SubRegIndex<8>;
-// Note: Code depends on these having consecutive numbers.
-def qqsub : SubRegIndex<256, 256>;
-
-def qsub_0 : SubRegIndex<128>;
-def qsub_1 : SubRegIndex<128, 128>;
-def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
-def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
-
-def dsub_0 : SubRegIndex<64>;
-def dsub_1 : SubRegIndex<64, 64>;
-def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
-def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
-}
-
-// Registers are identified with 5-bit ID numbers.
-class AArch64Reg<bits<16> enc, string n> : Register<n> {
+class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [],
+ list<string> altNames = []>
+ : Register<n, altNames> {
let HWEncoding = enc;
let Namespace = "AArch64";
-}
-
-class AArch64RegWithSubs<bits<16> enc, string n, list<Register> subregs = [],
- list<SubRegIndex> inds = []>
- : AArch64Reg<enc, n> {
let SubRegs = subregs;
- let SubRegIndices = inds;
+}
+
+let Namespace = "AArch64" in {
+ def sub_32 : SubRegIndex<32>;
+
+ def bsub : SubRegIndex<8>;
+ def hsub : SubRegIndex<16>;
+ def ssub : SubRegIndex<32>;
+ def dsub : SubRegIndex<32>;
+ def qhisub : SubRegIndex<64>;
+ def qsub : SubRegIndex<64>;
+ // Note: Code depends on these having consecutive numbers
+ def dsub0 : SubRegIndex<64>;
+ def dsub1 : SubRegIndex<64>;
+ def dsub2 : SubRegIndex<64>;
+ def dsub3 : SubRegIndex<64>;
+ // Note: Code depends on these having consecutive numbers
+ def qsub0 : SubRegIndex<128>;
+ def qsub1 : SubRegIndex<128>;
+ def qsub2 : SubRegIndex<128>;
+ def qsub3 : SubRegIndex<128>;
+}
+
+let Namespace = "AArch64" in {
+ def vreg : RegAltNameIndex;
+ def vlist1 : RegAltNameIndex;
}
//===----------------------------------------------------------------------===//
-// Integer registers: w0-w30, wzr, wsp, x0-x30, xzr, sp
+// Registers
//===----------------------------------------------------------------------===//
+def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>;
+def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>;
+def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>;
+def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>;
+def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>;
+def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>;
+def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>;
+def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>;
+def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>;
+def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>;
+def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>;
+def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>;
+def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>;
+def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>;
+def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>;
+def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>;
+def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>;
+def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>;
+def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>;
+def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>;
+def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>;
+def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>;
+def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>;
+def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>;
+def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>;
+def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>;
+def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>;
+def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>;
+def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>;
+def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>;
+def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>;
+def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>;
-foreach Index = 0-30 in {
- def W#Index : AArch64Reg< Index, "w"#Index>, DwarfRegNum<[Index]>;
+let SubRegIndices = [sub_32] in {
+def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;
+def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias<W1>;
+def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias<W2>;
+def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias<W3>;
+def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias<W4>;
+def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias<W5>;
+def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias<W6>;
+def X7 : AArch64Reg<7, "x7", [W7]>, DwarfRegAlias<W7>;
+def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias<W8>;
+def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias<W9>;
+def X10 : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
+def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
+def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
+def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
+def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
+def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
+def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
+def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
+def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
+def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
+def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
+def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
+def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
+def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
+def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
+def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
+def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
+def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
+def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
+def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>;
+def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>;
+def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias<WSP>;
+def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
}
-def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
-def WZR : AArch64Reg<31, "wzr">;
+// Condition code register.
+def NZCV : AArch64Reg<0, "nzcv">;
-// Could be combined with previous loop, but this way leaves w and x registers
-// consecutive as LLVM register numbers, which makes for easier debugging.
-foreach Index = 0-30 in {
- def X#Index : AArch64RegWithSubs<Index, "x"#Index,
- [!cast<Register>("W"#Index)], [sub_32]>,
- DwarfRegNum<[Index]>;
+// GPR register classes with the intersections of GPR32/GPR32sp and
+// GPR64/GPR64sp for use by the coalescer.
+def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
+ let AltOrders = [(rotl GPR32common, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64common : RegisterClass<"AArch64", [i64], 64,
+ (add (sequence "X%u", 0, 28), FP, LR)> {
+ let AltOrders = [(rotl GPR64common, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+// GPR register classes which exclude SP/WSP.
+def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> {
+ let AltOrders = [(rotl GPR32, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> {
+ let AltOrders = [(rotl GPR64, 8)];
+ let AltOrderSelect = [{ return 1; }];
}
-def XSP : AArch64RegWithSubs<31, "sp", [WSP], [sub_32]>, DwarfRegNum<[31]>;
-def XZR : AArch64RegWithSubs<31, "xzr", [WZR], [sub_32]>;
-
-// Most instructions treat register 31 as zero for reads and a black-hole for
-// writes.
-
-// Note that the order of registers is important for the Disassembler here:
-// tablegen uses it to form MCRegisterClass::getRegister, which we assume can
-// take an encoding value.
-def GPR32 : RegisterClass<"AArch64", [i32], 32,
- (add (sequence "W%u", 0, 30), WZR)> {
+// GPR register classes which include SP/WSP.
+def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> {
+ let AltOrders = [(rotl GPR32sp, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> {
+ let AltOrders = [(rotl GPR64sp, 8)];
+ let AltOrderSelect = [{ return 1; }];
}
-def GPR64 : RegisterClass<"AArch64", [i64], 64,
- (add (sequence "X%u", 0, 30), XZR)> {
-}
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>;
+def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
-def GPR32nowzr : RegisterClass<"AArch64", [i32], 32,
- (sequence "W%u", 0, 30)> {
-}
-
-def GPR64noxzr : RegisterClass<"AArch64", [i64], 64,
- (sequence "X%u", 0, 30)> {
-}
-
-// For tail calls, we can't use callee-saved registers or the structure-return
-// register, as they are supposed to be live across function calls and may be
-// clobbered by the epilogue.
-def tcGPR64 : RegisterClass<"AArch64", [i64], 64,
- (add (sequence "X%u", 0, 7),
- (sequence "X%u", 9, 18))> {
-}
-
-
-// Certain addressing-useful instructions accept sp directly. Again the order of
-// registers is important to the Disassembler.
-def GPR32wsp : RegisterClass<"AArch64", [i32], 32,
- (add (sequence "W%u", 0, 30), WSP)> {
-}
-
-def GPR64xsp : RegisterClass<"AArch64", [i64], 64,
- (add (sequence "X%u", 0, 30), XSP)> {
-}
-
-// Some aliases *only* apply to SP (e.g. MOV uses different encoding for SP and
-// non-SP variants). We can't use a bare register in those patterns because
-// TableGen doesn't like it, so we need a class containing just stack registers
-def Rxsp : RegisterClass<"AArch64", [i64], 64,
- (add XSP)> {
-}
-
-def Rwsp : RegisterClass<"AArch64", [i32], 32,
- (add WSP)> {
-}
-
-//===----------------------------------------------------------------------===//
-// Scalar registers in the vector unit:
-// b0-b31, h0-h31, s0-s31, d0-d31, q0-q31
-//===----------------------------------------------------------------------===//
-
-foreach Index = 0-31 in {
- def B # Index : AArch64Reg< Index, "b" # Index>,
- DwarfRegNum<[!add(Index, 64)]>;
-
- def H # Index : AArch64RegWithSubs<Index, "h" # Index,
- [!cast<Register>("B" # Index)], [sub_8]>,
- DwarfRegNum<[!add(Index, 64)]>;
-
- def S # Index : AArch64RegWithSubs<Index, "s" # Index,
- [!cast<Register>("H" # Index)], [sub_16]>,
- DwarfRegNum<[!add(Index, 64)]>;
-
- def D # Index : AArch64RegWithSubs<Index, "d" # Index,
- [!cast<Register>("S" # Index)], [sub_32]>,
- DwarfRegNum<[!add(Index, 64)]>;
-
- def Q # Index : AArch64RegWithSubs<Index, "q" # Index,
- [!cast<Register>("D" # Index)], [sub_64]>,
- DwarfRegNum<[!add(Index, 64)]>;
-}
-
-
-def FPR8 : RegisterClass<"AArch64", [v1i8], 8,
- (sequence "B%u", 0, 31)> {
-}
-
-def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16,
- (sequence "H%u", 0, 31)> {
-}
-
-def FPR32 : RegisterClass<"AArch64", [f32, v1i32], 32,
- (sequence "S%u", 0, 31)> {
-}
-
-def FPR64 : RegisterClass<"AArch64",
- [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
- 64, (sequence "D%u", 0, 31)>;
-
-def FPR128 : RegisterClass<"AArch64",
- [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
- 128, (sequence "Q%u", 0, 31)>;
-
-def FPR64Lo : RegisterClass<"AArch64",
- [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
- 64, (sequence "D%u", 0, 15)>;
-
-def FPR128Lo : RegisterClass<"AArch64",
- [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
- 128, (sequence "Q%u", 0, 15)>;
-
-//===----------------------------------------------------------------------===//
-// Vector registers:
-//===----------------------------------------------------------------------===//
-
-def VPR64AsmOperand : AsmOperandClass {
- let Name = "VPR";
- let PredicateMethod = "isReg";
+def GPR64spPlus0Operand : AsmOperandClass {
+ let Name = "GPR64sp0";
let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseGPR64sp0Operand";
}
-def VPR64 : RegisterOperand<FPR64, "printVPRRegister">;
-
-def VPR128 : RegisterOperand<FPR128, "printVPRRegister">;
-
-def VPR64Lo : RegisterOperand<FPR64Lo, "printVPRRegister">;
-
-def VPR128Lo : RegisterOperand<FPR128Lo, "printVPRRegister">;
-
-// Flags register
-def NZCV : Register<"nzcv"> {
- let Namespace = "AArch64";
+def GPR64sp0 : RegisterOperand<GPR64sp> {
+ let ParserMatchClass = GPR64spPlus0Operand;
}
-def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
- let CopyCost = -1;
+// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
+// constraint used by any instructions, it is used as a common super-class.
+def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
+def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// This is for indirect tail calls to store the address of the destination.
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
+ X22, X23, X24, X25, X26,
+ X27, X28)>;
+
+// GPR register classes for post increment amount of vector load/store that
+// has alternate printing when Rm=31 and prints a constant immediate value
+// equal to the total number of bytes transferred.
+
+// FIXME: TableGen *should* be able to do these itself now. There appears to be
+// a bug in counting how many operands a Post-indexed MCInst should have which
+// means the aliases don't trigger.
+def GPR64pi1 : RegisterOperand<GPR64, "printPostIncOperand<1>">;
+def GPR64pi2 : RegisterOperand<GPR64, "printPostIncOperand<2>">;
+def GPR64pi3 : RegisterOperand<GPR64, "printPostIncOperand<3>">;
+def GPR64pi4 : RegisterOperand<GPR64, "printPostIncOperand<4>">;
+def GPR64pi6 : RegisterOperand<GPR64, "printPostIncOperand<6>">;
+def GPR64pi8 : RegisterOperand<GPR64, "printPostIncOperand<8>">;
+def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">;
+def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">;
+def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">;
+def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">;
+def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
+def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
+
+// Condition code regclass.
+def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+
+ // CCR is not allocatable.
let isAllocatable = 0;
}
//===----------------------------------------------------------------------===//
-// Consecutive vector registers
+// Floating Point Scalar Registers
//===----------------------------------------------------------------------===//
-// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0
-def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
- [(rotl FPR64, 0), (rotl FPR64, 1)]>;
-
-// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1
-def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
- [(rotl FPR64, 0), (rotl FPR64, 1),
- (rotl FPR64, 2)]>;
-
-// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2
-def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
- [(rotl FPR64, 0), (rotl FPR64, 1),
- (rotl FPR64, 2), (rotl FPR64, 3)]>;
-// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31
-def Tuples2Q : RegisterTuples<[qsub_0, qsub_1],
- [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>;
+def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>;
+def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>;
+def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>;
+def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>;
+def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>;
+def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>;
+def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>;
+def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>;
+def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>;
+def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>;
+def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>;
+def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>;
+def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>;
+def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>;
+def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>;
+def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>;
+def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>;
+def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>;
+def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>;
+def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>;
+def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>;
+def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>;
+def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>;
+def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>;
+def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>;
+def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>;
+def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>;
+def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>;
+def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>;
+def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>;
+def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>;
-// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1
-def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2],
- [(rotl FPR128, 0), (rotl FPR128, 1),
- (rotl FPR128, 2)]>;
-
-// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2
-def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3],
- [(rotl FPR128, 0), (rotl FPR128, 1),
- (rotl FPR128, 2), (rotl FPR128, 3)]>;
-
-// The followings are super register classes to model 2/3/4 consecutive
-// 64-bit/128-bit registers.
-
-def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>;
-
-def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> {
- let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+let SubRegIndices = [bsub] in {
+def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias<B0>;
+def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias<B1>;
+def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias<B2>;
+def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias<B3>;
+def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias<B4>;
+def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias<B5>;
+def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias<B6>;
+def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias<B7>;
+def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias<B8>;
+def H9 : AArch64Reg<9, "h9", [B9]>, DwarfRegAlias<B9>;
+def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
+def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
+def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
+def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
+def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
+def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
+def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
+def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
+def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
+def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
+def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
+def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
+def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
+def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
+def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
+def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
+def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
+def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
+def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
+def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
+def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
+def H31 : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
}
-def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>;
-
-def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>;
-
-def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> {
- let Size = 384; // 3 x 128 bits, we have no predefined type of that size.
+let SubRegIndices = [hsub] in {
+def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias<B0>;
+def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias<B1>;
+def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias<B2>;
+def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias<B3>;
+def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias<B4>;
+def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias<B5>;
+def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias<B6>;
+def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias<B7>;
+def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias<B8>;
+def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias<B9>;
+def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
+def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
+def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
+def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
+def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
+def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
+def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
+def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
+def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
+def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
+def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
+def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
+def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
+def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
+def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
+def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
+def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
+def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
+def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
+def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
+def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
+def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
}
-def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>;
+let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
+def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
+def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
+def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
+def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
+def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
+def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
+def D6 : AArch64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
+def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
+def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
+def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
+def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
+def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
+def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
+def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
+def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
+def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
+def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
+def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
+def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
+def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
+def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
+def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
+def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
+def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
+def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
+def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
+def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
+def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
+def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
+def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
+def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
+def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
+def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
+def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
+def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
+def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
+def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
+def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
+def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
+def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
+def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
+def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
+def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
+def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
+def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
+def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
+def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
+def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
+def Q16 : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
+def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
+def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
+def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
+def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
+def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
+def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
+def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
+def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
+def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
+def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
+def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
+def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
+def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
+def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
+def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+ let Size = 8;
+}
+def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+ let Size = 16;
+}
+def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
+def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
+ v1i64],
+ 64, (sequence "D%u", 0, 31)>;
+// We don't (yet) have an f128 legal type, so don't use that here. We
+// normalize 128-bit vectors to v2f64 for arg passing and such, so use
+// that here.
+def FPR128 : RegisterClass<"AArch64",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+ 128, (sequence "Q%u", 0, 31)>;
+
+// The lower 16 vector registers. Some instructions can only take registers
+// in this range.
+def FPR128_lo : RegisterClass<"AArch64",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (trunc FPR128, 16)>;
+
+// Pairs, triples, and quads of 64-bit vector registers.
+def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2)]>;
+def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2), (rotl FPR64, 3)]>;
+def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> {
+ let Size = 128;
+}
+def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> {
+ let Size = 196;
+}
+def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> {
+ let Size = 256;
+}
+
+// Pairs, triples, and quads of 128-bit vector registers.
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2)]>;
+def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2), (rotl FPR128, 3)]>;
+def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> {
+ let Size = 256;
+}
+def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> {
+ let Size = 384;
+}
+def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
+ let Size = 512;
+}
-// The followings are vector list operands
-multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count,
- RegisterClass RegList> {
- def _asmoperand : AsmOperandClass {
- let Name = PREFIX # LAYOUT # Count;
- let RenderMethod = "addVectorListOperands";
- let PredicateMethod =
- "isVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">";
- let ParserMethod = "ParseVectorList";
+// Vector operand versions of the FP registers. Alternate name printing and
+// assmebler matching.
+def VectorReg64AsmOperand : AsmOperandClass {
+ let Name = "VectorReg64";
+ let PredicateMethod = "isVectorReg";
+}
+def VectorReg128AsmOperand : AsmOperandClass {
+ let Name = "VectorReg128";
+ let PredicateMethod = "isVectorReg";
+}
+
+def V64 : RegisterOperand<FPR64, "printVRegOperand"> {
+ let ParserMatchClass = VectorReg64AsmOperand;
+}
+
+def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
+ let ParserMatchClass = VectorReg128AsmOperand;
+}
+
+def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
+ let ParserMatchClass = VectorRegLoAsmOperand;
+}
+
+class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+ : AsmOperandClass {
+ let Name = "TypedVectorList" # count # "_" # lanes # kind;
+
+ let PredicateMethod
+ = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
+ let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+}
+
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+ : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
+ # kind # "'>">;
+
+multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
+ // With implicit types (probably on instruction instead). E.g. { v0, v1 }
+ def _64AsmOperand : AsmOperandClass {
+ let Name = NAME # "64";
+ let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+ let RenderMethod = "addVectorList64Operands<" # count # ">";
}
- def _operand : RegisterOperand<RegList,
- "printVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"> {
- let ParserMatchClass =
- !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand");
+ def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
}
+
+ def _128AsmOperand : AsmOperandClass {
+ let Name = NAME # "128";
+ let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+ let RenderMethod = "addVectorList128Operands<" # count # ">";
+ }
+
+ def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
+ }
+
+ // 64-bit register lists with explicit type.
+
+ // { v0.8b, v1.8b }
+ def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+ def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
+ }
+
+ // { v0.4h, v1.4h }
+ def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+ def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
+ }
+
+ // { v0.2s, v1.2s }
+ def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+ def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
+ }
+
+ // { v0.1d, v1.1d }
+ def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+ def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
+ }
+
+ // 128-bit register lists with explicit type
+
+ // { v0.16b, v1.16b }
+ def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+ def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
+ }
+
+ // { v0.8h, v1.8h }
+ def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+ def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
+ }
+
+ // { v0.4s, v1.4s }
+ def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+ def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
+ }
+
+ // { v0.2d, v1.2d }
+ def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+ def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
+ }
+
+ // { v0.b, v1.b }
+ def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+ def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
+ }
+
+ // { v0.h, v1.h }
+ def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+ def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
+ }
+
+ // { v0.s, v1.s }
+ def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+ def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
+ }
+
+ // { v0.d, v1.d }
+ def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+ def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
+ }
+
+
}
-multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
- RegisterClass QRegList> {
- defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>;
- defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>;
- defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>;
- defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>;
- defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>;
- defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>;
- defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>;
- defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>;
-}
+defm VecListOne : VectorList<1, FPR64, FPR128>;
+defm VecListTwo : VectorList<2, DD, QQ>;
+defm VecListThree : VectorList<3, DDD, QQQ>;
+defm VecListFour : VectorList<4, DDDD, QQQQ>;
-// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand
-defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
-defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
-defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
-defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
+
+// Register operand versions of the scalar FP registers.
+def FPR16Op : RegisterOperand<FPR16, "printOperand">;
+def FPR32Op : RegisterOperand<FPR32, "printOperand">;
+def FPR64Op : RegisterOperand<FPR64, "printOperand">;
+def FPR128Op : RegisterOperand<FPR128, "printOperand">;
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
new file mode 100644
index 0000000..0c3949e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -0,0 +1,291 @@
+//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A53 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
+def CortexA53Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 3; // Optimistic load latency assuming bypass.
+ // This is overriden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
+ // Specification - Instruction Timings"
+ // v 1.0 Spreadsheet
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-A53 is in-order.
+
+def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch
+def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map the ProcResources and
+// set the latency.
+
+let SchedModel = CortexA53Model in {
+
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+// forward a cycle earlier and then two cycles earlier in the case of a
+// shift-only instruction. These latencies will be incorrect when the
+// result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; }
+
+// MAC
+def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
+def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; }
+
+// Div
+def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; }
+def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; }
+
+// Load
+def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+// below, choosing the median of 3 which makes the latency 6.
+// May model this more carefully in the future. The remaining
+// A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+ let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+ let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+ let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+// accounted for in the WriteST* latencies below
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5;
+ let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+ let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+
+// Branch
+def : WriteRes<WriteBr, [A53UnitB]>;
+def : WriteRes<WriteBrReg, [A53UnitB]>;
+def : WriteRes<WriteSys, [A53UnitB]>;
+def : WriteRes<WriteBarrier, [A53UnitB]>;
+def : WriteRes<WriteHint, [A53UnitB]>;
+
+// FP ALU
+def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
+ let ResourceCycles = [29]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+ let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+ let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+ let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+ let ResourceCycles = [28]; }
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// No forwarding for these reads.
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+// operands are needed one cycle later if and only if they are to be
+// shifted. Otherwise, they too are needed two cycle later. This same
+// ReadAdvance applies to Extended registers as well, even though there is
+// a seperate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [A53ReadShifted]>,
+ SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, A53ReadISReg>;
+
+def A53ReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [A53ReadShifted]>,
+ SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, A53ReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+// Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
new file mode 100644
index 0000000..a2a1802
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -0,0 +1,865 @@
+//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AArch64 Cyclone to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CycloneModel : SchedMachineModel {
+ let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 16; // 14-19 cycles are typical.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cyclone.
+
+// 4 integer pipes
+def CyUnitI : ProcResource<4> {
+ let BufferSize = 48;
+}
+
+// 2 branch units: I[0..1]
+def CyUnitB : ProcResource<2> {
+ let Super = CyUnitI;
+ let BufferSize = 24;
+}
+
+// 1 indirect-branch unit: I[0]
+def CyUnitBR : ProcResource<1> {
+ let Super = CyUnitB;
+}
+
+// 2 shifter pipes: I[2..3]
+// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
+def CyUnitIS : ProcResource<2> {
+ let Super = CyUnitI;
+ let BufferSize = 24;
+}
+
+// 1 mul pipe: I[0]
+def CyUnitIM : ProcResource<1> {
+ let Super = CyUnitBR;
+ let BufferSize = 32;
+}
+
+// 1 div pipe: I[1]
+def CyUnitID : ProcResource<1> {
+ let Super = CyUnitB;
+ let BufferSize = 16;
+}
+
+// 1 integer division unit. This is driven by the ID pipe, but only
+// consumes the pipe for one cycle at issue and another cycle at writeback.
+def CyUnitIntDiv : ProcResource<1>;
+
+// 2 ld/st pipes.
+def CyUnitLS : ProcResource<2> {
+ let BufferSize = 28;
+}
+
+// 3 fp/vector pipes.
+def CyUnitV : ProcResource<3> {
+ let BufferSize = 48;
+}
+// 2 fp/vector arithmetic and multiply pipes: V[0-1]
+def CyUnitVM : ProcResource<2> {
+ let Super = CyUnitV;
+ let BufferSize = 32;
+}
+// 1 fp/vector division/sqrt pipe: V[2]
+def CyUnitVD : ProcResource<1> {
+ let Super = CyUnitV;
+ let BufferSize = 16;
+}
+// 1 fp compare pipe: V[0]
+def CyUnitVC : ProcResource<1> {
+ let Super = CyUnitVM;
+ let BufferSize = 16;
+}
+
+// 2 fp division/square-root units. These are driven by the VD pipe,
+// but only consume the pipe for one cycle at issue and a cycle at writeback.
+def CyUnitFloatDiv : ProcResource<2>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write resources and latency on Cyclone.
+// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
+
+let SchedModel = CycloneModel in {
+
+//---
+// 7.8.1. Moves
+//---
+
+// A single nop micro-op (uX).
+def WriteX : SchedWriteRes<[]> { let Latency = 0; }
+
+// Move zero is a register rename (to machine register zero).
+// The move is replaced by a single nop micro-op.
+// MOVZ Rd, #0
+// AND Rd, Rzr, #imm
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
+def WriteImmZ : SchedWriteVariant<[
+ SchedVar<WriteZPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteImm]>]>;
+def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
+
+// Move GPR is a register rename and single nop micro-op.
+// ORR Xd, XZR, Xm
+// ADD Xd, Xn, #0
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
+def WriteMov : SchedWriteVariant<[
+ SchedVar<WriteIMovPred, [WriteX]>,
+ SchedVar<WriteVMovPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
+
+// Move non-zero immediate is an integer ALU op.
+// MOVN,MOVZ,MOVK
+def : WriteRes<WriteImm, [CyUnitI]>;
+
+//---
+// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
+// Shifts and Bitfield Operations
+//---
+
+// ADR,ADRP
+// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
+// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
+// ADC(S),SBC(S)
+// Aliases: CMN, CMP, TST
+//
+// Conditional operations.
+// CCMNi,CCMPi,CCMNr,CCMPr,
+// CSEL,CSINC,CSINV,CSNEG
+//
+// Bit counting and reversal operations.
+// CLS,CLZ,RBIT,REV,REV16,REV32
+def : WriteRes<WriteI, [CyUnitI]>;
+
+// ADD with shifted register operand is a single micro-op that
+// consumes a shift pipeline for two cycles.
+// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
+// EXAMPLE: ADDrs Xn, Xm LSL #imm
+def : WriteRes<WriteISReg, [CyUnitIS]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// ADD with extended register operand is the same as shifted reg operand.
+// ADD(S)re,SUB(S)re
+// EXAMPLE: ADDXre Xn, Xm, UXTB #1
+def : WriteRes<WriteIEReg, [CyUnitIS]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// Variable shift and bitfield operations.
+// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
+def : WriteRes<WriteIS, [CyUnitIS]>;
+
+// EXTR Shifts a pair of registers and requires two micro-ops.
+// The second micro-op is delayed, as modeled by ReadExtrHi.
+// EXTR Xn, Xm, #imm
+def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// EXTR's first register read is delayed by one cycle, effectively
+// shortening its writer's latency.
+// EXTR Xn, Xm, #imm
+def : ReadAdvance<ReadExtrHi, 1>;
+
+//---
+// 7.8.6. Multiplies
+//---
+
+// MUL/MNEG are aliases for MADD/MSUB.
+// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
+def : WriteRes<WriteIM32, [CyUnitIM]> {
+ let Latency = 4;
+}
+// MADDX,MSUBX,SMULH,UMULH
+def : WriteRes<WriteIM64, [CyUnitIM]> {
+ let Latency = 5;
+}
+
+//---
+// 7.8.7. Divide
+//---
+
+// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVW,UDIVW
+def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 10];
+}
+// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVX,UDIVX
+def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
+ let Latency = 13;
+ let ResourceCycles = [2, 13];
+}
+
+//---
+// 7.8.8,7.8.10. Load/Store, single element
+//---
+
+// Integer loads take 4 cycles and use one LS unit for one cycle.
+def : WriteRes<WriteLD, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// Store-load forwarding is 4 cycles.
+//
+// Note: The store-exclusive sequence incorporates this
+// latency. However, general heuristics should not model the
+// dependence between a store and subsequent may-alias load because
+// hardware speculation works.
+def : WriteRes<WriteST, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// Load from base address plus an optionally scaled register offset.
+// Rt latency is latency WriteIS + WriteLD.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def CyWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
+ SchedVar<NoSchedPred, [WriteLD]>]>; // Load from register offset.
+def : SchedAlias<WriteLDIdx, CyWriteLDIdx>; // Map AArch64->Cyclone type.
+
+// EXAMPLE: STR Xn, Xm [, lsl 3]
+def CyWriteSTIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
+ SchedVar<NoSchedPred, [WriteST]>]>; // Store to register offset.
+def : SchedAlias<WriteSTIdx, CyWriteSTIdx>; // Map AArch64->Cyclone type.
+
+// Read the (unshifted) base register Xn in the second micro-op one cycle later.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def ReadBaseRS : SchedReadAdvance<1>;
+def CyReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
+ SchedVar<NoSchedPred, [ReadDefault]>]>; // Read base reg with no shift.
+def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
+
+//---
+// 7.8.9,7.8.11. Load/Store, paired
+//---
+
+// Address pre/post increment is a simple ALU op with one cycle latency.
+def : WriteRes<WriteAdr, [CyUnitI]>;
+
+// LDP high register write is fused with the load, but a nop micro-op remains.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+}
+
+// STP is a vector op and store, except for QQ, which is just two stores.
+def : SchedAlias<WriteSTP, WriteVSTShuffle>;
+def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
+
+//---
+// 7.8.13. Branches
+//---
+
+// Branches take a single micro-op.
+// The misprediction penalty is defined as a SchedMachineModel property.
+def : WriteRes<WriteBr, [CyUnitB]> {let Latency = 0;}
+def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
+
+//---
+// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
+//---
+
+// NOP,SEV,SEVL,WFE,WFI,YIELD
+def : WriteRes<WriteHint, []> {let Latency = 0;}
+// ISB
+def : InstRW<[WriteI], (instrs ISB)>;
+// SLREX,DMB,DSB
+def : WriteRes<WriteBarrier, [CyUnitLS]>;
+
+// System instructions get an invalid latency because the latency of
+// other operations across them is meaningless.
+def : WriteRes<WriteSys, []> {let Latency = -1;}
+
+//===----------------------------------------------------------------------===//
+// 7.9 Vector Unit Instructions
+
+// Simple vector operations take 2 cycles.
+def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+
+// Define some longer latency vector op types for Cyclone.
+def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
+def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
+def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
+
+// Simple floating-point operations take 2 cycles.
+def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
+
+//---
+// 7.9.1 Vector Moves
+//---
+
+// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
+// generates expensive int-float conversion instead:
+// FMOVDi Dd, #0.0
+// FMOVv2f64ns Vd.2d, #0.0
+
+// FMOVSi,FMOVDi
+def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
+
+// MOVI,MVNI are WriteV
+// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
+
+// Move FPR is a register rename and single nop micro-op.
+// ORR.16b Vd,Vn,Vn
+// COPY is handled above in the WriteMov Variant.
+def WriteVMov : SchedWriteVariant<[
+ SchedVar<WriteVMovPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteV]>]>;
+def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
+
+// FMOVSr,FMOVDr are WriteF.
+
+// MOV V,V is a WriteV.
+
+// CPY D,V[x] is a WriteV
+
+// INS V[x],V[y] is a WriteV.
+
+// FMOVWSr,FMOVXDr,FMOVXDHighr
+def : WriteRes<WriteFCopy, [CyUnitLS]> {
+ let Latency = 5;
+}
+
+// FMOVSWr,FMOVDXr
+def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
+
+// INS V[x],R
+def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
+
+// SMOV,UMOV R,V[x]
+def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
+def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
+
+// DUP V,R
+def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
+
+// DUP V,V[x] is a WriteV.
+
+//---
+// 7.9.2 Integer Arithmetic, Logical, and Comparisons
+//---
+
+// BIC,ORR V,#imm are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "ABSv")>;
+
+// MVN,NEG,NOT are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
+
+// ADDP is a WriteV.
+def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
+
+def : InstRW<[CyWriteV3],
+ (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
+
+// ADD,SUB are WriteV
+
+// Forward declare.
+def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+
+// Add/Diff and accumulate uses the vector multiply unit.
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVAccum : SchedReadAdvance<1,
+ [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+ (instregex "SADALP","UADALP")>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+ (instregex "SABAv","UABAv","SABALv","UABALv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
+
+def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
+
+// WriteV includes:
+// AND,BIC,CMTST,EOR,ORN,ORR
+// ADDP
+// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
+// SADDL,SSUBL,UADDL,USUBL
+// SADDW,SSUBW,UADDW,USUBW
+
+def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
+ "CMLEv","CMLTv",
+ "CMHIv","CMHSv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
+ "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
+
+def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
+ "SABDLv","UABDLv")>;
+
+//---
+// 7.9.3 Floating Point Arithmetic and Comparisons
+//---
+
+// FABS,FNEG are WriteF
+
+def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
+def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
+
+def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
+ "FMINPv2i","FMINNMPv2i")>;
+
+def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
+
+def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
+ FSUBSrr,FSUBv2f32,FSUBv4f32,
+ FADDPv2f32,FADDPv4f32,
+ FABD32,FABDv2f32,FABDv4f32)>;
+def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
+ FSUBDrr,FSUBv2f64,
+ FADDPv2f64,
+ FABD64,FABDv2f64)>;
+
+def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
+
+def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
+ "FMAXS","FMAXD","FMAXv",
+ "FMINS","FMIND","FMINv",
+ "FMAXNMS","FMAXNMD","FMAXNMv",
+ "FMINNMS","FMINNMD","FMINNMv",
+ "FMAXPv2f","FMAXPv4f",
+ "FMINPv2f","FMINPv4f",
+ "FMAXNMPv2f","FMAXNMPv4f",
+ "FMINNMPv2f","FMINNMPv4f")>;
+
+// FCMP,FCMPE,FCCMP,FCCMPE
+def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
+
+// FCSEL is a WriteF.
+
+//---
+// 7.9.4 Shifts and Bitfield Operations
+//---
+
+// SHL is a WriteV
+
+def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
+
+def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
+
+// Shift and accumulate uses the vector multiply unit.
+def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVShiftAcc : SchedReadAdvance<1,
+ [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
+def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// SSHL,USHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
+
+// SQSHL,SQSHLU,UQSHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
+
+// WriteV includes:
+// SHLL,SSHLL,USHLL
+// SLI,SRI
+// BIF,BIT,BSL
+// EXT
+// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
+// XTN2
+
+def : InstRW<[CyWriteV4],
+ (instregex "RSHRNv","SHRNv",
+ "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
+ "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+
+//---
+// 7.9.5 Multiplication
+//---
+
+def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
+def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
+ "SQDMULLv","SQDMULHv","SQRDMULHv")>;
+
+// FMUL,FMULX,FNMUL default to WriteFMul.
+def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
+
+def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
+def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
+ FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
+
+def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
+def : InstRW<[CyWriteVMul, CyReadVMulAcc],
+ (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
+ "SQDMLAL","SQDMLSL")>;
+
+def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
+def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
+def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
+def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
+
+def : InstRW<[CyWriteSMul, CyReadSMul],
+ (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
+ FMLAv2f32,FMLAv4f32,
+ FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
+def : InstRW<[CyWriteDMul, CyReadDMul],
+ (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
+ FMLAv2f64,FMLAv2i64_indexed,
+ FMLSv2f64,FMLSv2i64_indexed)>;
+
+def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
+def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
+
+//---
+// 7.9.6 Divide and Square Root
+//---
+
+// FDIV,FSQRT
+// TODO: Add 64-bit variant with 19 cycle latency.
+// TODO: Specialize FSQRT for longer latency.
+def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
+ let Latency = 17;
+ let ResourceCycles = [2, 17];
+}
+
+def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
+
+def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
+def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
+
+def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
+def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
+def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>;
+def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
+
+//---
+// 7.9.7 Integer-FP Conversions
+//---
+
+// FCVT lengthen f16/s32
+def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
+
+// FCVT,FCVTN,FCVTXN
+// SCVTF,UCVTF V,V
+// FRINT(AIMNPXZ) V,V
+def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
+
+// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
+def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
+
+// FCVT Rd, S/D = V6+LD4: 10 cycles
+def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
+def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
+
+// FCVTL is a WriteV
+
+//---
+// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
+//---
+
+def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
+def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
+ AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
+ SHA1SU0rrr)>;
+
+def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
+def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
+
+def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
+def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
+ SHA256Hrrr,SHA256H2rrr)>;
+
+// TRN,UZP,ZUP are WriteV.
+
+// TBL,TBX are WriteV.
+
+//---
+// 7.9.11-7.9.14 Load/Store, single element and paired
+//---
+
+// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
+def : WriteRes<WriteVLD, [CyUnitLS]> {
+ let Latency = 5;
+}
+
+// Store-load forwarding is 4 cycles.
+def : WriteRes<WriteVST, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// WriteVLDPair/VSTPair sequences are expanded by the target description.
+
+//---
+// 7.9.15 Load, element operations
+//---
+
+// Only the first WriteVLD and WriteAdr for writeback matches def operands.
+// Subsequent WriteVLDs consume resources. Since all loaded values have the
+// same latency, this is acceptable.
+
+// Vd is read 5 cycles after issuing the vector load.
+def : ReadAdvance<ReadVLD, 5>;
+
+def : InstRW<[WriteVLD],
+ (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+ (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+// Register writes from the load's high half are fused micro-ops.
+def : InstRW<[WriteVLD],
+ (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+ (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
+ (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],
+ (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
+ (instregex "LD1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle],
+ (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr],
+ (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+ (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+ (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
+ (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+ (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+ (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+ (instregex "LD2i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+ (instregex "LD2i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+ (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+ (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+ (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+ (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
+ (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
+ (instregex "LD3i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+ (instregex "LD3i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+ (instregex "LD3i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+ (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+ (instrs LD3Rv1d,LD3Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+ (instrs LD3Rv2d_POST,LD3Rv2d_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
+ WriteVLDPairShuffle, WriteVLDPairShuffle],
+ (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
+ WriteVLDPairShuffle, WriteVLDPairShuffle],
+ (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+ (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+ (instregex "LD4i(8|16|32)_POST")>;
+
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+ (instrs LD4i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+ (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+ (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4Rv1d,LD4Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
+
+//---
+// 7.9.16 Store, element operations
+//---
+
+// Only the WriteAdr for writeback matches a def operands.
+// Subsequent WriteVLDs only consume resources.
+
+def : InstRW<[WriteVST],
+ (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST],
+ (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],
+ (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+ (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST],
+ (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST],
+ (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVST],
+ (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
+ (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle],
+ (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+ (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
+
+def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>;
+def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
+
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
+ WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
+ WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+
+//---
+// Unused SchedRead types
+//---
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+
+} // SchedModel = CycloneModel
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index ec8450b..eaa9110 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -1,4 +1,4 @@
-//===- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,74 +7,98 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Generic processor itineraries for legacy compatibility.
+// Define TII for use in SchedVariant Predicates.
+// const MachineInstr *MI and const TargetSchedModel *SchedModel
+// are defined by default.
+def : PredicateProlog<[{
+ const AArch64InstrInfo *TII =
+ static_cast<const AArch64InstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
-def GenericItineraries : ProcessorItineraries<[], [], []>;
+// AArch64 Scheduler Definitions
+def WriteImm : SchedWrite; // MOVN, MOVZ
+// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
+// select the correct sequence of WriteImms.
-//===----------------------------------------------------------------------===//
-// Base SchedReadWrite types
+def WriteI : SchedWrite; // ALU
+def WriteISReg : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg : SchedWrite; // ALU of Extended-Reg
+def ReadI : SchedRead; // ALU
+def ReadISReg : SchedRead; // ALU of Shifted-Reg
+def ReadIEReg : SchedRead; // ALU of Extended-Reg
+def WriteExtr : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair
+def WriteIS : SchedWrite; // Shift/Scale
+def WriteID32 : SchedWrite; // 32-bit Divide
+def WriteID64 : SchedWrite; // 64-bit Divide
+def ReadID : SchedRead; // 32/64-bit Divide
+def WriteIM32 : SchedWrite; // 32-bit Multiply
+def WriteIM64 : SchedWrite; // 64-bit Multiply
+def ReadIM : SchedRead; // 32/64-bit Multiply
+def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate
+def WriteBr : SchedWrite; // Branch
+def WriteBrReg : SchedWrite; // Indirect Branch
-// Basic ALU
-def WriteALU : SchedWrite; // Generic: may contain shift and/or ALU operation
-def WriteALUs : SchedWrite; // Shift only with no ALU operation
-def ReadALU : SchedRead; // Operand not needed for shifting
-def ReadALUs : SchedRead; // Operand needed for shifting
+def WriteLD : SchedWrite; // Load from base addr plus immediate offset
+def WriteST : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP : SchedWrite; // Store a register pair.
+def WriteAdr : SchedWrite; // Address pre/post increment.
-// Multiply with optional accumulate
-def WriteMAC : SchedWrite;
-def ReadMAC : SchedRead;
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
-// Compares
-def WriteCMP : SchedWrite;
-def ReadCMP : SchedRead;
+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
-// Division
-def WriteDiv : SchedWrite;
-def ReadDiv : SchedRead;
+// Predicate for determining when a extendedable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
-// Loads
-def WriteLd : SchedWrite;
-def WritePreLd : SchedWrite;
-def WriteVecLd : SchedWrite;
-def ReadLd : SchedRead;
-def ReadPreLd : SchedRead;
-def ReadVecLd : SchedRead;
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
-// Stores
-def WriteSt : SchedWrite;
-def WriteVecSt : SchedWrite;
-def ReadSt : SchedRead;
-def ReadVecSt : SchedRead;
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
-// Branches
-def WriteBr : SchedWrite;
-def WriteBrL : SchedWrite;
-def ReadBr : SchedRead;
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
-// Floating Point ALU
-def WriteFPALU : SchedWrite;
-def ReadFPALU : SchedRead;
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
-// Floating Point MAC, Mul, Div, Sqrt
-// Most processors will simply send all of these down a dedicated pipe, but
-// they're explicitly seperated here for flexibility of modeling later. May
-// consider consolidating them into a single WriteFPXXXX type in the future.
-def WriteFPMAC : SchedWrite;
-def WriteFPMul : SchedWrite;
-def WriteFPDiv : SchedWrite;
-def WriteFPSqrt : SchedWrite;
-def ReadFPMAC : SchedRead;
-def ReadFPMul : SchedRead;
-def ReadFPDiv : SchedRead;
-def ReadFPSqrt : SchedRead;
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
-// Noop
-def WriteNoop : SchedWrite;
+def WriteSys : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint : SchedWrite; // Hint instruction.
+def WriteF : SchedWrite; // General floating-point ops.
+def WriteFCmp : SchedWrite; // Floating-point compare.
+def WriteFCvt : SchedWrite; // Float conversion.
+def WriteFCopy : SchedWrite; // Float-int register copy.
+def WriteFImm : SchedWrite; // Floating-point immediate.
+def WriteFMul : SchedWrite; // Floating-point multiply.
+def WriteFDiv : SchedWrite; // Floating-point division.
-//===----------------------------------------------------------------------===//
-// Subtarget specific Machine Models.
+def WriteV : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
-include "AArch64ScheduleA53.td"
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/lib/Target/AArch64/AArch64ScheduleA53.td b/lib/Target/AArch64/AArch64ScheduleA53.td
deleted file mode 100644
index 20a14e7..0000000
--- a/lib/Target/AArch64/AArch64ScheduleA53.td
+++ /dev/null
@@ -1,144 +0,0 @@
-//=- AArch64ScheduleA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the itinerary class data for the ARM Cortex A53 processors.
-//
-//===----------------------------------------------------------------------===//
-
-// ===---------------------------------------------------------------------===//
-// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedModel.h for details.
-
-// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
-def CortexA53Model : SchedMachineModel {
- let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
- let LoadLatency = 2; // Optimistic load latency assuming bypass.
- // This is overriden by OperandCycles if the
- // Itineraries are queried instead.
- let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
- // Specification - Instruction Timings"
- // v 1.0 Spreadsheet
-}
-
-
-//===----------------------------------------------------------------------===//
-// Define each kind of processor resource and number available.
-
-// Modeling each pipeline as a ProcResource using the default BufferSize = -1.
-// Cortex-A53 is in-order and therefore should be using BufferSize = 0. The
-// current configuration performs better with the basic latencies provided so
-// far. Will revisit BufferSize once the latency information is more accurate.
-
-let SchedModel = CortexA53Model in {
-
-def A53UnitALU : ProcResource<2>; // Int ALU
-def A53UnitMAC : ProcResource<1>; // Int MAC
-def A53UnitDiv : ProcResource<1>; // Int Division
-def A53UnitLdSt : ProcResource<1>; // Load/Store
-def A53UnitB : ProcResource<1>; // Branch
-def A53UnitFPALU : ProcResource<1>; // FP ALU
-def A53UnitFPMDS : ProcResource<1>; // FP Mult/Div/Sqrt
-
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedWrite types which both map the ProcResources and
-// set the latency.
-
-// Issue - Every instruction must consume an A53WriteIssue. Optionally,
-// instructions that cannot be dual-issued will also include the
-// A53WriteIssue2nd in their SchedRW list. That second WriteRes will
-// ensure that a second issue slot is consumed.
-def A53WriteIssue : SchedWriteRes<[]>;
-def A53WriteIssue2nd : SchedWriteRes<[]> { let Latency = 0; }
-
-// ALU - These are reduced to 1 despite a true latency of 4 in order to easily
-// model forwarding logic. Once forwarding is properly modelled, then
-// they'll be corrected.
-def : WriteRes<WriteALU, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteALUs, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteCMP, [A53UnitALU]> { let Latency = 1; }
-
-// MAC
-def : WriteRes<WriteMAC, [A53UnitMAC]> { let Latency = 4; }
-
-// Div
-def : WriteRes<WriteDiv, [A53UnitDiv]> { let Latency = 4; }
-
-// Load - Note: Vector loads take 1-5 cycles to issue. For the WriteVecLd below,
-// choosing the median of 3 which makes the latency 6. May model this more
-// carefully in the future.
-def : WriteRes<WriteLd, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WritePreLd, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WriteVecLd, [A53UnitLdSt]> { let Latency = 6; }
-
-// Store - Note: Vector stores take 1-3 cycles to issue. For the ReadVecSt below,
-// choosing the median of 2 which makes the latency 5. May model this more
-// carefully in the future.
-def : WriteRes<WriteSt, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WriteVecSt, [A53UnitLdSt]> { let Latency = 5; }
-
-// Branch
-def : WriteRes<WriteBr, [A53UnitB]>;
-def : WriteRes<WriteBrL, [A53UnitB]>;
-
-// FP ALU
-def : WriteRes<WriteFPALU, [A53UnitFPALU]> {let Latency = 6; }
-
-// FP MAC, Mul, Div, Sqrt
-// Using Double Precision numbers for now as a worst case. Additionally, not
-// modeling the exact hazard but instead treating the whole pipe as a hazard.
-// As an example VMUL, VMLA, and others are actually pipelined. VDIV and VSQRT
-// have a total latency of 33 and 32 respectively but only a hazard of 29 and
-// 28 (double-prescion example).
-def : WriteRes<WriteFPMAC, [A53UnitFPMDS]> { let Latency = 10; }
-def : WriteRes<WriteFPMul, [A53UnitFPMDS]> { let Latency = 6; }
-def : WriteRes<WriteFPDiv, [A53UnitFPMDS]> { let Latency = 33;
- let ResourceCycles = [29]; }
-def : WriteRes<WriteFPSqrt, [A53UnitFPMDS]> { let Latency = 32;
- let ResourceCycles = [28]; }
-
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedRead types.
-
-// No forwarding defined for ReadALU yet.
-def : ReadAdvance<ReadALU, 0>;
-
-// No forwarding defined for ReadCMP yet.
-def : ReadAdvance<ReadCMP, 0>;
-
-// No forwarding defined for ReadBr yet.
-def : ReadAdvance<ReadBr, 0>;
-
-// No forwarding defined for ReadMAC yet.
-def : ReadAdvance<ReadMAC, 0>;
-
-// No forwarding defined for ReadDiv yet.
-def : ReadAdvance<ReadDiv, 0>;
-
-// No forwarding defined for ReadLd, ReadPreLd, ReadVecLd yet.
-def : ReadAdvance<ReadLd, 0>;
-def : ReadAdvance<ReadPreLd, 0>;
-def : ReadAdvance<ReadVecLd, 0>;
-
-// No forwarding defined for ReadSt and ReadVecSt yet.
-def : ReadAdvance<ReadSt, 0>;
-def : ReadAdvance<ReadVecSt, 0>;
-
-// No forwarding defined for ReadFPALU yet.
-def : ReadAdvance<ReadFPALU, 0>;
-
-// No forwarding defined for ReadFPMAC/Mul/Div/Sqrt yet.
-def : ReadAdvance<ReadFPMAC, 0>;
-def : ReadAdvance<ReadFPMul, 0>;
-def : ReadAdvance<ReadFPDiv, 0>;
-def : ReadAdvance<ReadFPSqrt, 0>;
-
-}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 6bbe075..5c65b75 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -11,15 +11,49 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "arm-selectiondag-info"
#include "AArch64TargetMachine.h"
-#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
-AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const AArch64TargetMachine &TM)
- : TargetSelectionDAGInfo(TM),
- Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
-}
+#define DEBUG_TYPE "aarch64-selectiondag-info"
-AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {
+AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM)
+ : TargetSelectionDAGInfo(TM),
+ Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {}
+
+AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {}
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+ ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
+ const char *bzeroEntry =
+ (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr;
+ // For small size (< 256), it is not beneficial to use bzero
+ // instead of memset.
+ if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+ const AArch64TargetLowering &TLI =
+ *static_cast<const AArch64TargetLowering *>(
+ DAG.getTarget().getTargetLowering());
+
+ EVT IntPtr = TLI.getPointerTy();
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(Chain)
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+ .setDiscardResult();
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
+ return SDValue();
}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index d412ed2..8381f99 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -11,22 +11,27 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64SELECTIONDAGINFO_H
-#define LLVM_AARCH64SELECTIONDAGINFO_H
+#ifndef AArch64SELECTIONDAGINFO_H
+#define AArch64SELECTIONDAGINFO_H
#include "llvm/Target/TargetSelectionDAGInfo.h"
namespace llvm {
-class AArch64TargetMachine;
-
class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
-public:
- explicit AArch64SelectionDAGInfo(const AArch64TargetMachine &TM);
- ~AArch64SelectionDAGInfo();
-};
+public:
+ explicit AArch64SelectionDAGInfo(const TargetMachine &TM);
+ ~AArch64SelectionDAGInfo();
+
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+};
}
#endif
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
new file mode 100644
index 0000000..45f8ddb
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -0,0 +1,168 @@
+//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies floating point stores that should not be combined into
+// store pairs. Later we may do the same for floating point loads.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-stp-suppress"
+
+namespace {
+class AArch64StorePairSuppress : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ MachineFunction *MF;
+ TargetSchedModel SchedModel;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+
+public:
+ static char ID;
+ AArch64StorePairSuppress() : MachineFunctionPass(ID) {}
+
+ virtual const char *getPassName() const override {
+ return "AArch64 Store Pair Suppression";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+ bool shouldAddSTPToBlock(const MachineBasicBlock *BB);
+
+ bool isNarrowFPStore(const MachineInstr &MI);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char AArch64StorePairSuppress::ID = 0;
+} // anonymous
+
+FunctionPass *llvm::createAArch64StorePairSuppressPass() {
+ return new AArch64StorePairSuppress();
+}
+
+/// Return true if an STP can be added to this block without increasing the
+/// critical resource height. STP is good to form in Ld/St limited blocks and
+/// bad to form in float-point limited blocks. This is true independent of the
+/// critical path. If the critical path is longer than the resource height, the
+/// extra vector ops can limit physreg renaming. Otherwise, it could simply
+/// oversaturate the vector units.
+bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
+ unsigned ResLength = BBTrace.getResourceLength();
+
+ // Get the machine model's scheduling class for STPQi.
+ // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
+ unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass();
+ const MCSchedClassDesc *SCDesc =
+ SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+
+ // If a subtarget does not define resources for STPQi, bail here.
+ if (SCDesc->isValid() && !SCDesc->isVariant()) {
+ unsigned ResLenWithSTP = BBTrace.getResourceLength(
+ ArrayRef<const MachineBasicBlock *>(), SCDesc);
+ if (ResLenWithSTP > ResLength) {
+ DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
+ << " resources " << ResLength << " -> " << ResLenWithSTP
+ << "\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Return true if this is a floating-point store smaller than the V reg. On
+/// cyclone, these require a vector shuffle before storing a pair.
+/// Ideally we would call getMatchingPairOpcode() and have the machine model
+/// tell us if it's profitable with no cpu knowledge here.
+///
+/// FIXME: We plan to develop a decent Target abstraction for simple loads and
+/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer.
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ return true;
+ }
+}
+
+bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo());
+ TRI = MF->getTarget().getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ const TargetSubtargetInfo &ST =
+ MF->getTarget().getSubtarget<TargetSubtargetInfo>();
+ SchedModel.init(*ST.getSchedModel(), &ST, TII);
+
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr;
+
+ DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n');
+
+ if (!SchedModel.hasInstrSchedModel()) {
+ DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
+ return false;
+ }
+
+ // Check for a sequence of stores to the same base address. We don't need to
+ // precisely determine whether a store pair can be formed. But we do want to
+ // filter out most situations where we can't form store pairs to avoid
+ // computing trace metrics in those cases.
+ for (auto &MBB : *MF) {
+ bool SuppressSTP = false;
+ unsigned PrevBaseReg = 0;
+ for (auto &MI : MBB) {
+ if (!isNarrowFPStore(MI))
+ continue;
+ unsigned BaseReg;
+ unsigned Offset;
+ if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) {
+ if (PrevBaseReg == BaseReg) {
+ // If this block can take STPs, skip ahead to the next block.
+ if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
+ break;
+ // Otherwise, continue unpairing the stores in this block.
+ DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+ SuppressSTP = true;
+ TII->suppressLdStPair(&MI);
+ }
+ PrevBaseReg = BaseReg;
+ } else
+ PrevBaseReg = 0;
+ }
+ }
+ // This pass just sets some internal MachineMemOperand flags. It can't really
+ // invalidate anything.
+ return false;
+}
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 9140bbd..cd69994 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information --------------===//
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,57 +7,110 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the AArch64 specific subclass of TargetSubtargetInfo.
+// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
+#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
-#include "AArch64RegisterInfo.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-
-#define GET_SUBTARGETINFO_TARGET_DESC
-#define GET_SUBTARGETINFO_CTOR
-#include "AArch64GenSubtargetInfo.inc"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-// Pin the vtable to this file.
-void AArch64Subtarget::anchor() {}
+#define DEBUG_TYPE "aarch64-subtarget"
-AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
- bool LittleEndian)
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+static cl::opt<bool>
+EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
+ "converter pass"), cl::init(true), cl::Hidden);
+
+AArch64Subtarget::AArch64Subtarget(const std::string &TT,
+ const std::string &CPU,
+ const std::string &FS, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- HasFPARMv8(false), HasNEON(false), HasCrypto(false), TargetTriple(TT),
- CPUString(CPU), IsLittleEndian(LittleEndian) {
+ HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
+ HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
+ TargetTriple(TT), IsLittleEndian(LittleEndian) {
+ // Determine default and user-specified characteristics
- initializeSubtargetFeatures(CPU, FS);
-}
-
-void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
- StringRef FS) {
- if (CPU.empty())
+ if (CPUString.empty())
CPUString = "generic";
- std::string FullFS = FS;
- if (CPUString == "generic") {
- // Enable FP by default.
- if (FullFS.empty())
- FullFS = "+fp-armv8";
- else
- FullFS = "+fp-armv8," + FullFS;
- }
-
- ParseSubtargetFeatures(CPU, FullFS);
+ ParseSubtargetFeatures(CPUString, FS);
}
-bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
- Reloc::Model RelocM) const {
- if (RelocM == Reloc::Static)
- return false;
+/// ClassifyGlobalReference - Find the target operand flags that describe
+/// how a global value should be referenced for the current subtarget.
+unsigned char
+AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const {
- return !GV->hasLocalLinkage() && !GV->hasHiddenVisibility();
+ // Determine whether this is a reference to a definition or a declaration.
+ // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+ // load from stub.
+ bool isDecl = GV->hasAvailableExternallyLinkage();
+ if (GV->isDeclaration() && !GV->isMaterializable())
+ isDecl = true;
+
+ // MachO large model always goes via a GOT, simply to get a single 8-byte
+ // absolute relocation on all global addresses.
+ if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
+ return AArch64II::MO_GOT;
+
+ // The small code mode's direct accesses use ADRP, which cannot necessarily
+ // produce the value 0 (if the code is above 4GB). Therefore they must use the
+ // GOT.
+ if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
+ return AArch64II::MO_GOT;
+
+ // If symbol visibility is hidden, the extra load is not needed if
+ // the symbol is definitely defined in the current translation unit.
+
+ // The handling of non-hidden symbols in PIC mode is rather target-dependent:
+ // + On MachO, if the symbol is defined in this module the GOT can be
+ // skipped.
+ // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
+ // defined could end up in unexpected places. Use a GOT.
+ if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
+ if (isTargetMachO())
+ return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT
+ : AArch64II::MO_NO_FLAG;
+ else
+ // No need to go through the GOT for local symbols on ELF.
+ return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
+ }
+
+ return AArch64II::MO_NO_FLAG;
+}
+
+/// This function returns the name of a function which has an interface
+/// like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over
+/// memset with zero passed as the second argument. Otherwise it
+/// returns null.
+const char *AArch64Subtarget::getBZeroEntry() const {
+ // Prefer bzero on Darwin only.
+ if(isTargetDarwin())
+ return "bzero";
+
+ return nullptr;
+}
+
+void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin, MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ // LNT run (at least on Cyclone) showed reasonably significant gains for
+ // bi-directional scheduling. 253.perlbmk.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+}
+
+bool AArch64Subtarget::enableEarlyIfConversion() const {
+ return EnableEarlyIfConvert;
}
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 68c6c4b..590ea05 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -1,4 +1,4 @@
-//==-- AArch64Subtarget.h - Define Subtarget for the AArch64 ---*- C++ -*--===//
+//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,29 +7,27 @@
//
//===----------------------------------------------------------------------===//
//
-// This file declares the AArch64 specific subclass of TargetSubtargetInfo.
+// This file declares the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AARCH64_SUBTARGET_H
-#define LLVM_TARGET_AARCH64_SUBTARGET_H
+#ifndef AArch64SUBTARGET_H
+#define AArch64SUBTARGET_H
-#include "llvm/ADT/Triple.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include "AArch64RegisterInfo.h"
+#include <string>
#define GET_SUBTARGETINFO_HEADER
#include "AArch64GenSubtargetInfo.inc"
-#include <string>
-
namespace llvm {
-class StringRef;
class GlobalValue;
+class StringRef;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
- virtual void anchor();
protected:
- enum ARMProcFamilyEnum {Others, CortexA53, CortexA57};
+ enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone};
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily;
@@ -37,47 +35,76 @@
bool HasFPARMv8;
bool HasNEON;
bool HasCrypto;
+ bool HasCRC;
- /// TargetTriple - What processor and OS we're targeting.
- Triple TargetTriple;
+ // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
+ bool HasZeroCycleRegMove;
+
+ // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
+ bool HasZeroCycleZeroing;
/// CPUString - String name of used CPU.
std::string CPUString;
- /// IsLittleEndian - The target is Little Endian
- bool IsLittleEndian;
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
-private:
- void initializeSubtargetFeatures(StringRef CPU, StringRef FS);
+ /// IsLittleEndian - Is the target little endian?
+ bool IsLittleEndian;
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
- ///
- AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
- bool LittleEndian);
+ AArch64Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, bool LittleEndian);
- virtual bool enableMachineScheduler() const {
- return true;
- }
+ bool enableMachineScheduler() const override { return true; }
+
+ bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
+
+ bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+
+ bool hasFPARMv8() const { return HasFPARMv8; }
+ bool hasNEON() const { return HasNEON; }
+ bool hasCrypto() const { return HasCrypto; }
+ bool hasCRC() const { return HasCRC; }
+
+ bool isLittleEndian() const { return IsLittleEndian; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ bool isCyclone() const { return CPUString == "cyclone"; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return 64; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+ /// ClassifyGlobalReference - Find the target operand flags that describe
+ /// how a global value should be referenced for the current subtarget.
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
- bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
- bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+ /// the current subtarget and it is considered prefereable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
- bool hasFPARMv8() const { return HasFPARMv8; }
- bool hasNEON() const { return HasNEON; }
- bool hasCrypto() const { return HasCrypto; }
+ void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const override;
- bool isLittle() const { return IsLittleEndian; }
-
- const std::string & getCPUString() const { return CPUString; }
+ bool enableEarlyIfConversion() const override;
};
} // End llvm namespace
-#endif // LLVM_TARGET_AARCH64_SUBTARGET_H
+#endif // AArch64SUBTARGET_H
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index d9c990d..0b5dd2f 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -7,41 +7,80 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains the implementation of the AArch64TargetMachine
-// methods. Principally just setting up the passes needed to generate correct
-// code on this architecture.
//
//===----------------------------------------------------------------------===//
#include "AArch64.h"
#include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
-
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
+static cl::opt<bool>
+EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar"
+ " integer instructions"), cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote "
+ "constant pass"), cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the"
+ " linker optimization hints (LOH)"), cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool>
+EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden,
+ cl::desc("Enable the pass that removes dead"
+ " definitons and replaces stores to"
+ " them with stores to the zero"
+ " register"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
+ " optimization pass"), cl::init(true), cl::Hidden);
+
extern "C" void LLVMInitializeAArch64Target() {
+ // Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
+
+ RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget);
+ RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget);
}
+/// TargetMachine ctor - Create an AArch64 architecture model.
+///
AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL,
bool LittleEndian)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, LittleEndian),
- InstrInfo(Subtarget),
- DL(LittleEndian ?
- "e-m:e-i64:64-i128:128-n32:64-S128" :
- "E-m:e-i64:64-i128:128-n32:64-S128"),
- TLInfo(*this),
- TSInfo(*this),
- FrameLowering(Subtarget) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, LittleEndian),
+ // This nested ternary is horrible, but DL needs to be properly
+ // initialized
+ // before TLInfo is constructed.
+ DL(Subtarget.isTargetMachO()
+ ? "e-m:o-i64:64-i128:128-n32:64-S128"
+ : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
+ : "E-m:e-i64:64-i128:128-n32:64-S128")),
+ InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget),
+ TSInfo(*this) {
initAsmInfo();
}
@@ -63,6 +102,27 @@
CodeGenOpt::Level OL)
: AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+namespace {
+/// AArch64 Code Generator Pass Configuration Options.
+class AArch64PassConfig : public TargetPassConfig {
+public:
+ AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ AArch64TargetMachine &getAArch64TargetMachine() const {
+ return getTM<AArch64TargetMachine>();
+ }
+
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addILPOpts() override;
+ bool addPreRegAlloc() override;
+ bool addPostRegAlloc() override;
+ bool addPreSched2() override;
+ bool addPreEmitPass() override;
+};
+} // namespace
+
void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
// Add first the target-independent BasicTTI pass, then our AArch64 pass. This
// allows the AArch64 pass to delegate to the target independent layer when
@@ -71,42 +131,78 @@
PM.add(createAArch64TargetTransformInfoPass(this));
}
-namespace {
-/// AArch64 Code Generator Pass Configuration Options.
-class AArch64PassConfig : public TargetPassConfig {
-public:
- AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
-
- AArch64TargetMachine &getAArch64TargetMachine() const {
- return getTM<AArch64TargetMachine>();
- }
-
- const AArch64Subtarget &getAArch64Subtarget() const {
- return *getAArch64TargetMachine().getSubtargetImpl();
- }
-
- virtual bool addInstSelector();
- virtual bool addPreEmitPass();
-};
-} // namespace
-
TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
return new AArch64PassConfig(this, PM);
}
-bool AArch64PassConfig::addPreEmitPass() {
- addPass(&UnpackMachineBundlesID);
- addPass(createAArch64BranchFixupPass());
- return true;
+// Pass Pipeline Configuration
+bool AArch64PassConfig::addPreISel() {
+ // Run promote constant before global merge, so that the promoted constants
+ // get a chance to be merged
+ if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
+ addPass(createAArch64PromoteConstantPass());
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createGlobalMergePass(TM));
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createAArch64AddressTypePromotionPass());
+
+ // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
+ // ourselves.
+ addPass(createAtomicExpandLoadLinkedPass(TM));
+
+ return false;
}
bool AArch64PassConfig::addInstSelector() {
- addPass(createAArch64ISelDAG(getAArch64TargetMachine(), getOptLevel()));
+ addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
- // For ELF, cleanup any local-dynamic TLS accesses.
- if (getAArch64Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+ // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
+ // references to _TLS_MODULE_BASE_ as possible.
+ if (TM->getSubtarget<AArch64Subtarget>().isTargetELF() &&
+ getOptLevel() != CodeGenOpt::None)
addPass(createAArch64CleanupLocalDynamicTLSPass());
return false;
}
+
+bool AArch64PassConfig::addILPOpts() {
+ if (EnableCCMP)
+ addPass(createAArch64ConditionalCompares());
+ addPass(&EarlyIfConverterID);
+ if (EnableStPairSuppress)
+ addPass(createAArch64StorePairSuppressPass());
+ return true;
+}
+
+bool AArch64PassConfig::addPreRegAlloc() {
+ // Use AdvSIMD scalar instructions whenever profitable.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+ addPass(createAArch64AdvSIMDScalar());
+ return true;
+}
+
+bool AArch64PassConfig::addPostRegAlloc() {
+ // Change dead register definitions to refer to the zero register.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
+ addPass(createAArch64DeadRegisterDefinitions());
+ return true;
+}
+
+bool AArch64PassConfig::addPreSched2() {
+ // Expand some pseudo instructions to allow proper scheduling.
+ addPass(createAArch64ExpandPseudoPass());
+ // Use load/store pair instructions when possible.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
+ addPass(createAArch64LoadStoreOptimizationPass());
+ return true;
+}
+
+bool AArch64PassConfig::addPreEmitPass() {
+ // Relax conditional branch instructions if they're otherwise out of
+ // range of their destination.
+ addPass(createAArch64BranchRelaxation());
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
+ TM->getSubtarget<AArch64Subtarget>().isTargetMachO())
+ addPass(createAArch64CollectLOHPass());
+ return true;
+}
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 4297c92..079b19b 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -1,4 +1,4 @@
-//=== AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-===//
+//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -11,60 +11,60 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64TARGETMACHINE_H
-#define LLVM_AARCH64TARGETMACHINE_H
+#ifndef AArch64TARGETMACHINE_H
+#define AArch64TARGETMACHINE_H
-#include "AArch64FrameLowering.h"
-#include "AArch64ISelLowering.h"
#include "AArch64InstrInfo.h"
-#include "AArch64SelectionDAGInfo.h"
+#include "AArch64ISelLowering.h"
#include "AArch64Subtarget.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64SelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/MC/MCStreamer.h"
namespace llvm {
class AArch64TargetMachine : public LLVMTargetMachine {
- AArch64Subtarget Subtarget;
- AArch64InstrInfo InstrInfo;
- const DataLayout DL;
- AArch64TargetLowering TLInfo;
- AArch64SelectionDAGInfo TSInfo;
- AArch64FrameLowering FrameLowering;
+protected:
+ AArch64Subtarget Subtarget;
+
+private:
+ const DataLayout DL;
+ AArch64InstrInfo InstrInfo;
+ AArch64TargetLowering TLInfo;
+ AArch64FrameLowering FrameLowering;
+ AArch64SelectionDAGInfo TSInfo;
public:
AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool LittleEndian);
+ CodeGenOpt::Level OL, bool IsLittleEndian);
- const AArch64InstrInfo *getInstrInfo() const {
- return &InstrInfo;
+ const AArch64Subtarget *getSubtargetImpl() const override {
+ return &Subtarget;
}
-
- const AArch64FrameLowering *getFrameLowering() const {
- return &FrameLowering;
- }
-
- const AArch64TargetLowering *getTargetLowering() const {
+ const AArch64TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
-
- const AArch64SelectionDAGInfo *getSelectionDAGInfo() const {
+ const DataLayout *getDataLayout() const override { return &DL; }
+ const AArch64FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const AArch64RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
- const AArch64Subtarget *getSubtargetImpl() const { return &Subtarget; }
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const DataLayout *getDataLayout() const { return &DL; }
-
- const TargetRegisterInfo *getRegisterInfo() const {
- return &InstrInfo.getRegisterInfo();
- }
- TargetPassConfig *createPassConfig(PassManagerBase &PM);
-
- virtual void addAnalysisPasses(PassManagerBase &PM);
+ /// \brief Register AArch64 analysis passes with a pass manager.
+ void addAnalysisPasses(PassManagerBase &PM) override;
};
// AArch64leTargetMachine - AArch64 little endian target machine.
@@ -72,8 +72,8 @@
class AArch64leTargetMachine : public AArch64TargetMachine {
virtual void anchor();
public:
- AArch64leTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
+ AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
@@ -83,12 +83,12 @@
class AArch64beTargetMachine : public AArch64TargetMachine {
virtual void anchor();
public:
- AArch64beTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
+ AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-} // End llvm namespace
+} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 663d619..4069038 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -6,19 +6,47 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file deals with any AArch64 specific requirements on object files.
-//
-//===----------------------------------------------------------------------===//
-
#include "AArch64TargetObjectFile.h"
-
+#include "AArch64TargetMachine.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Dwarf.h"
using namespace llvm;
+using namespace dwarf;
-void
-AArch64ElfTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
+void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
+
+const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
+ // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+ // is an indirect pc-relative reference. The default implementation
+ // won't reference using the GOT, so we need this target-specific
+ // version.
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+ const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+ MCSymbol *PCSym = getContext().CreateTempSymbol();
+ Streamer.EmitLabel(PCSym);
+ const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
+ return MCBinaryExpr::CreateSub(Res, PC, getContext());
+ }
+
+ return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV, Mang);
+}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 0f00a78..de63cb4 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -1,4 +1,4 @@
-//===-- AArch64TargetObjectFile.h - AArch64 Object Info ---------*- C++ -*-===//
+//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,25 +6,34 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file deals with any AArch64 specific requirements on object files.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
-#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
+#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
namespace llvm {
+class AArch64TargetMachine;
- /// AArch64ElfTargetObjectFile - This implementation is used for ELF
- /// AArch64 targets.
- class AArch64ElfTargetObjectFile : public TargetLoweringObjectFileELF {
- virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
- };
+/// This implementation is used for AArch64 ELF targets (Linux in particular).
+class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+public:
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
+};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e2a1647..33e482a 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass ---------===//
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,15 +14,18 @@
///
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "aarch64tti"
#include "AArch64.h"
#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
using namespace llvm;
+#define DEBUG_TYPE "aarch64tti"
+
// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
@@ -33,25 +36,28 @@
namespace {
class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
+ const AArch64TargetMachine *TM;
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
+ /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+ /// are set if the result needs to be inserted and/or extracted from vectors.
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
public:
- AArch64TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+ AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
AArch64TTI(const AArch64TargetMachine *TM)
- : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+ : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
TLI(TM->getTargetLowering()) {
initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() override {
- pushTTIStack(this);
- }
+ void initializePass() override { pushTTIStack(this); }
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -59,31 +65,37 @@
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
- return (TargetTransformInfo*)this;
+ return (TargetTransformInfo *)this;
return this;
}
/// \name Scalar TTI Implementations
/// @{
+ unsigned getIntImmCost(int64_t Val) const;
+ unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+ unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
+ unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
+ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
/// @}
-
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector) const {
+ unsigned getNumberOfRegisters(bool Vector) const override {
if (Vector) {
if (ST->hasNEON())
return 32;
return 0;
}
- return 32;
+ return 31;
}
- unsigned getRegisterBitWidth(bool Vector) const {
+ unsigned getRegisterBitWidth(bool Vector) const override {
if (Vector) {
if (ST->hasNEON())
return 128;
@@ -92,6 +104,26 @@
return 64;
}
+ unsigned getMaximumUnrollFactor() const override { return 2; }
+
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
+ override;
+
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
+ override;
+
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Opd1Info = OK_AnyValue,
+ OperandValueKind Opd2Info = OK_AnyValue) const
+ override;
+
+ unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
+ override;
+
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const override;
/// @}
};
@@ -105,3 +137,328 @@
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
return new AArch64TTI(TM);
}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+ // Check if the immediate can be encoded within an instruction.
+ if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
+ return 0;
+
+ if (Val < 0)
+ Val = ~Val;
+
+ // Calculate how many moves we will need to materialize this constant.
+ unsigned LZ = countLeadingZeros((uint64_t)Val);
+ return (64 - LZ + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize & 0x3f)
+ ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+ // Split the constant into 64-bit chunks and calculate the cost for each
+ // chunk.
+ unsigned Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+ // We need at least one instruction to materialze the constant.
+ return std::max(1U, Cost);
+}
+
+unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TCC_Free;
+
+ unsigned ImmIdx = ~0U;
+ switch (Opcode) {
+ default:
+ return TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr.
+ if (Idx == 0)
+ return 2 * TCC_Basic;
+ return TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TCC_Free;
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if (Idx == ImmIdx) {
+ unsigned NumConstants = (BitSize + 63) / 64;
+ unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TCC_Basic)
+ ? static_cast<unsigned>(TCC_Free) : Cost;
+ }
+ return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TCC_Free;
+
+ switch (IID) {
+ default:
+ return TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ if (Idx == 1) {
+ unsigned NumConstants = (BitSize + 63) / 64;
+ unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TCC_Basic)
+ ? static_cast<unsigned>(TCC_Free) : Cost;
+ }
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
+ }
+ return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+AArch64TTI::PopcntSupportKind
+AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ if (TyWidth == 32 || TyWidth == 64)
+ return PSK_FastHardware;
+ // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+ return PSK_Software;
+}
+
+unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ EVT SrcTy = TLI->getValueType(Src);
+ EVT DstTy = TLI->getValueType(Dst);
+
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+ static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+ // LowerVectorINT_TO_FP:
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ // LowerVectorFP_TO_INT
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
+ };
+
+ int Idx = ConvertCostTableLookup<MVT>(
+ ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return ConversionTbl[Idx].Cost;
+
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+
+ // The element at index zero is already inside the vector.
+ if (Index == 0)
+ return 0;
+ }
+
+ // All other insert/extracts cost this much.
+ return 2;
+}
+
+unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Opd1Info,
+ OperandValueKind Opd2Info) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ switch (ISD) {
+ default:
+ return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
+ Opd2Info);
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::AND:
+ // These nodes are marked as 'custom' for combining purposes only.
+ // We know that they are legal. See LowerAdd in ISelLowering.
+ return 1 * LT.first;
+ }
+}
+
+unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
+ unsigned NumVectorInstToHideOverhead = 10;
+
+ if (Ty->isVectorTy() && IsComplex)
+ return NumVectorInstToHideOverhead;
+
+ // In many cases the address computation is not merged into the instruction
+ // addressing mode.
+ return 1;
+}
+
+unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const {
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ // We don't lower vector selects well that are wider than the register width.
+ if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+ // We would need this many instructions to hide the scalarization happening.
+ unsigned AmortizationCost = 20;
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ VectorSelectTbl[] = {
+ { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
+ { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+ { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+ { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+ };
+
+ EVT SelCondTy = TLI->getValueType(CondTy);
+ EVT SelValTy = TLI->getValueType(ValTy);
+ if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+ int Idx =
+ ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
+ SelValTy.getSimpleVT());
+ if (Idx != -1)
+ return VectorSelectTbl[Idx].Cost;
+ }
+ }
+ return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+ if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+ Src->getVectorElementType()->isIntegerTy(64)) {
+ // Unaligned stores are extremely inefficient. We don't split
+ // unaligned v2i64 stores because the negative impact that has shown in
+ // practice on inlined memcpy code.
+ // We make v2i64 stores expensive so that we will only vectorize if there
+ // are 6 other instructions getting vectorized.
+ unsigned AmortizationCost = 6;
+
+ return LT.first * 2 * AmortizationCost;
+ }
+
+ if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+ Src->getVectorNumElements() < 8) {
+ // We scalarize the loads/stores because there is not v.4b register and we
+ // have to promote the elements to v.4h.
+ unsigned NumVecElts = Src->getVectorNumElements();
+ unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+ // We generate 2 instructions per vector element.
+ return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ }
+
+ return LT.first;
+}
diff --git a/lib/Target/AArch64/Android.mk b/lib/Target/AArch64/Android.mk
index 144c2d3..d0a50da 100644
--- a/lib/Target/AArch64/Android.mk
+++ b/lib/Target/AArch64/Android.mk
@@ -3,31 +3,41 @@
arm64_codegen_TBLGEN_TABLES := \
AArch64GenRegisterInfo.inc \
AArch64GenInstrInfo.inc \
- AArch64GenCodeEmitter.inc \
- AArch64GenMCCodeEmitter.inc \
- AArch64GenMCPseudoLowering.inc \
AArch64GenAsmWriter.inc \
- AArch64GenAsmMatcher.inc \
+ AArch64GenAsmWriter1.inc \
AArch64GenDAGISel.inc \
- AArch64GenFastISel.inc \
AArch64GenCallingConv.inc \
+ AArch64GenAsmMatcher.inc \
AArch64GenSubtargetInfo.inc \
- AArch64GenDisassemblerTables.inc
+ AArch64GenMCCodeEmitter.inc \
+ AArch64GenFastISel.inc \
+ AArch64GenDisassemblerTables.inc \
+ AArch64GenMCPseudoLowering.inc \
arm64_codegen_SRC_FILES := \
+ AArch64AddressTypePromotion.cpp \
+ AArch64AdvSIMDScalarPass.cpp \
AArch64AsmPrinter.cpp \
+ AArch64BranchRelaxation.cpp \
+ AArch64CleanupLocalDynamicTLSPass.cpp \
+ AArch64CollectLOH.cpp \
+ AArch64ConditionalCompares.cpp \
+ AArch64DeadRegisterDefinitionsPass.cpp \
+ AArch64ExpandPseudoInsts.cpp \
+ AArch64FastISel.cpp \
AArch64FrameLowering.cpp \
- AArch64ISelDAGToDAG.cpp \
- AArch64MachineFunctionInfo.cpp \
- AArch64RegisterInfo.cpp \
- AArch64Subtarget.cpp \
- AArch64TargetObjectFile.cpp \
- AArch64BranchFixupPass.cpp \
AArch64InstrInfo.cpp \
+ AArch64ISelDAGToDAG.cpp \
AArch64ISelLowering.cpp \
+ AArch64LoadStoreOptimizer.cpp \
AArch64MCInstLower.cpp \
+ AArch64PromoteConstant.cpp \
+ AArch64RegisterInfo.cpp \
AArch64SelectionDAGInfo.cpp \
+ AArch64StorePairSuppress.cpp \
+ AArch64Subtarget.cpp \
AArch64TargetMachine.cpp \
+ AArch64TargetObjectFile.cpp \
AArch64TargetTransformInfo.cpp
# For the host
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index e933ec1..65b77c5 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -6,34 +6,31 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file contains the (GNU-style) assembly parser for the AArch64
-// architecture.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cstdio>
using namespace llvm;
namespace {
@@ -41,21 +38,74 @@
class AArch64Operand;
class AArch64AsmParser : public MCTargetAsmParser {
+public:
+ typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector;
+
+private:
+ StringRef Mnemonic; ///< Instruction mnemonic.
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+ MCAsmParser &getParser() const { return Parser; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+
+ bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+ AArch64CC::CondCode parseCondCodeString(StringRef Cond);
+ bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+ int tryParseRegister();
+ int tryMatchVectorRegister(StringRef &Kind, bool expected);
+ bool parseRegister(OperandVector &Operands);
+ bool parseSymbolicImmVal(const MCExpr *&ImmVal);
+ bool parseVectorList(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands, bool isCondCode,
+ bool invertCondCode);
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+ bool showMatchError(SMLoc Loc, unsigned ErrCode);
+
+ bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveTLSDescCall(SMLoc L);
+
+ bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+
+ bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) override;
+/// @name Auto-generated Match Functions
+/// {
+
#define GET_ASSEMBLER_HEADER
#include "AArch64GenAsmMatcher.inc"
+ /// }
+
+ OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
+ OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+ OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
+ OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+ OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+ OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
+ OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+ OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
+ OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+ OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
+ bool tryParseVectorRegister(OperandVector &Operands);
+
public:
enum AArch64MatchResultTy {
- Match_FirstAArch64 = FIRST_TARGET_MATCH_RESULT_TY,
+ Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "AArch64GenAsmMatcher.inc"
};
-
AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
- const MCInstrInfo &MII)
+ const MCInstrInfo &MII,
+ const MCTargetOptions &Options)
: MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
MCAsmParserExtension::Initialize(_Parser);
@@ -63,191 +113,197 @@
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
- // These are the public interface of the MCTargetAsmParser
- bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned Kind) override;
- bool ParseDirective(AsmToken DirectiveID);
- bool ParseDirectiveTLSDescCall(SMLoc L);
- bool ParseDirectiveWord(unsigned Size, SMLoc L);
-
- bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer&Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm);
-
- // The rest of the sub-parsers have more freedom over interface: they return
- // an OperandMatchResultTy because it's less ambiguous than true/false or
- // -1/0/1 even if it is more verbose
- OperandMatchResultTy
- ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Mnemonic);
-
- OperandMatchResultTy ParseImmediate(const MCExpr *&ExprVal);
-
- OperandMatchResultTy ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind);
-
- OperandMatchResultTy
- ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- uint32_t NumLanes);
-
- OperandMatchResultTy
- ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- uint32_t &NumLanes);
-
- OperandMatchResultTy
- ParseImmWithLSLOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseCondCodeOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseCRxOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseFPImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseFPImm0AndImm0Operand( SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- template<typename SomeNamedImmMapper> OperandMatchResultTy
- ParseNamedImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- return ParseNamedImmOperand(SomeNamedImmMapper(), Operands);
- }
-
- OperandMatchResultTy
- ParseNamedImmOperand(const NamedImmMapper &Mapper,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseLSXAddressOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseShiftExtend(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- OperandMatchResultTy
- ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout,
- SMLoc &LayoutLoc);
-
- OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &);
-
- bool validateInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
- /// Scan the next token (which had better be an identifier) and determine
- /// whether it represents a general-purpose or vector register. It returns
- /// true if an identifier was found and populates its reference arguments. It
- /// does not consume the token.
- bool
- IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, StringRef &LayoutSpec,
- SMLoc &LayoutLoc) const;
-
+ static bool classifySymbolRef(const MCExpr *Expr,
+ AArch64MCExpr::VariantKind &ELFRefKind,
+ MCSymbolRefExpr::VariantKind &DarwinRefKind,
+ int64_t &Addend);
};
-
-}
+} // end anonymous namespace
namespace {
-/// Instances of this class represent a parsed AArch64 machine instruction.
+/// AArch64Operand - Instances of this class represent a parsed AArch64 machine
+/// instruction.
class AArch64Operand : public MCParsedAsmOperand {
private:
enum KindTy {
- k_ImmWithLSL, // #uimm {, LSL #amt }
- k_CondCode, // eq/ne/...
- k_FPImmediate, // Limited-precision floating-point imm
- k_Immediate, // Including expressions referencing symbols
+ k_Immediate,
+ k_ShiftedImm,
+ k_CondCode,
k_Register,
+ k_VectorList,
+ k_VectorIndex,
+ k_Token,
+ k_SysReg,
+ k_SysCR,
+ k_Prefetch,
k_ShiftExtend,
- k_VectorList, // A sequential list of 1 to 4 registers.
- k_SysReg, // The register operand of MRS and MSR instructions
- k_Token, // The mnemonic; other raw tokens the auto-generated
- k_WrappedRegister // Load/store exclusive permit a wrapped register.
+ k_FPImm,
+ k_Barrier
} Kind;
SMLoc StartLoc, EndLoc;
- struct ImmWithLSLOp {
- const MCExpr *Val;
- unsigned ShiftAmount;
- bool ImplicitAmount;
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
};
- struct CondCodeOp {
- A64CC::CondCodes Code;
+ struct RegOp {
+ unsigned RegNum;
+ bool isVector;
};
- struct FPImmOp {
- double Val;
+ struct VectorListOp {
+ unsigned RegNum;
+ unsigned Count;
+ unsigned NumElements;
+ unsigned ElementKind;
+ };
+
+ struct VectorIndexOp {
+ unsigned Val;
};
struct ImmOp {
const MCExpr *Val;
};
- struct RegOp {
- unsigned RegNum;
+ struct ShiftedImmOp {
+ const MCExpr *Val;
+ unsigned ShiftAmount;
};
- struct ShiftExtendOp {
- A64SE::ShiftExtSpecifiers ShiftType;
- unsigned Amount;
- bool ImplicitAmount;
+ struct CondCodeOp {
+ AArch64CC::CondCode Code;
};
- // A vector register list is a sequential list of 1 to 4 registers.
- struct VectorListOp {
- unsigned RegNum;
- unsigned Count;
- A64Layout::VectorLayout Layout;
+ struct FPImmOp {
+ unsigned Val; // Encoded 8-bit representation.
+ };
+
+ struct BarrierOp {
+ unsigned Val; // Not the enum since not all values have names.
};
struct SysRegOp {
const char *Data;
unsigned Length;
+ uint64_t FeatureBits; // We need to pass through information about which
+ // core we are compiling for so that the SysReg
+ // Mappers can appropriately conditionalize.
};
- struct TokOp {
- const char *Data;
- unsigned Length;
+ struct SysCRImmOp {
+ unsigned Val;
+ };
+
+ struct PrefetchOp {
+ unsigned Val;
+ };
+
+ struct ShiftExtendOp {
+ AArch64_AM::ShiftExtendType Type;
+ unsigned Amount;
+ bool HasExplicitAmount;
+ };
+
+ struct ExtendOp {
+ unsigned Val;
};
union {
- struct ImmWithLSLOp ImmWithLSL;
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct VectorListOp VectorList;
+ struct VectorIndexOp VectorIndex;
+ struct ImmOp Imm;
+ struct ShiftedImmOp ShiftedImm;
struct CondCodeOp CondCode;
struct FPImmOp FPImm;
- struct ImmOp Imm;
- struct RegOp Reg;
- struct ShiftExtendOp ShiftExtend;
- struct VectorListOp VectorList;
+ struct BarrierOp Barrier;
struct SysRegOp SysReg;
- struct TokOp Tok;
+ struct SysCRImmOp SysCRImm;
+ struct PrefetchOp Prefetch;
+ struct ShiftExtendOp ShiftExtend;
};
- AArch64Operand(KindTy K, SMLoc S, SMLoc E)
- : MCParsedAsmOperand(), Kind(K), StartLoc(S), EndLoc(E) {}
+ // Keep the MCContext around as the MCExprs may need manipulated during
+ // the add<>Operands() calls.
+ MCContext &Ctx;
+
+ AArch64Operand(KindTy K, MCContext &_Ctx)
+ : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {}
public:
- AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand() {
+ AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ switch (Kind) {
+ case k_Token:
+ Tok = o.Tok;
+ break;
+ case k_Immediate:
+ Imm = o.Imm;
+ break;
+ case k_ShiftedImm:
+ ShiftedImm = o.ShiftedImm;
+ break;
+ case k_CondCode:
+ CondCode = o.CondCode;
+ break;
+ case k_FPImm:
+ FPImm = o.FPImm;
+ break;
+ case k_Barrier:
+ Barrier = o.Barrier;
+ break;
+ case k_Register:
+ Reg = o.Reg;
+ break;
+ case k_VectorList:
+ VectorList = o.VectorList;
+ break;
+ case k_VectorIndex:
+ VectorIndex = o.VectorIndex;
+ break;
+ case k_SysReg:
+ SysReg = o.SysReg;
+ break;
+ case k_SysCR:
+ SysCRImm = o.SysCRImm;
+ break;
+ case k_Prefetch:
+ Prefetch = o.Prefetch;
+ break;
+ case k_ShiftExtend:
+ ShiftExtend = o.ShiftExtend;
+ break;
+ }
}
- SMLoc getStartLoc() const { return StartLoc; }
- SMLoc getEndLoc() const { return EndLoc; }
- void print(raw_ostream&) const;
- void dump() const;
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
- unsigned getReg() const {
- assert((Kind == k_Register || Kind == k_WrappedRegister)
- && "Invalid access!");
- return Reg.RegNum;
+ bool isTokenSuffix() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return Tok.IsSuffix;
}
const MCExpr *getImm() const {
@@ -255,1307 +311,1572 @@
return Imm.Val;
}
- A64CC::CondCodes getCondCode() const {
+ const MCExpr *getShiftedImmVal() const {
+ assert(Kind == k_ShiftedImm && "Invalid access!");
+ return ShiftedImm.Val;
+ }
+
+ unsigned getShiftedImmShift() const {
+ assert(Kind == k_ShiftedImm && "Invalid access!");
+ return ShiftedImm.ShiftAmount;
+ }
+
+ AArch64CC::CondCode getCondCode() const {
assert(Kind == k_CondCode && "Invalid access!");
return CondCode.Code;
}
- static bool isNonConstantExpr(const MCExpr *E,
- AArch64MCExpr::VariantKind &Variant) {
- if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) {
- Variant = A64E->getKind();
+ unsigned getFPImm() const {
+ assert(Kind == k_FPImm && "Invalid access!");
+ return FPImm.Val;
+ }
+
+ unsigned getBarrier() const {
+ assert(Kind == k_Barrier && "Invalid access!");
+ return Barrier.Val;
+ }
+
+ unsigned getReg() const override {
+ assert(Kind == k_Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ unsigned getVectorListStart() const {
+ assert(Kind == k_VectorList && "Invalid access!");
+ return VectorList.RegNum;
+ }
+
+ unsigned getVectorListCount() const {
+ assert(Kind == k_VectorList && "Invalid access!");
+ return VectorList.Count;
+ }
+
+ unsigned getVectorIndex() const {
+ assert(Kind == k_VectorIndex && "Invalid access!");
+ return VectorIndex.Val;
+ }
+
+ StringRef getSysReg() const {
+ assert(Kind == k_SysReg && "Invalid access!");
+ return StringRef(SysReg.Data, SysReg.Length);
+ }
+
+ uint64_t getSysRegFeatureBits() const {
+ assert(Kind == k_SysReg && "Invalid access!");
+ return SysReg.FeatureBits;
+ }
+
+ unsigned getSysCR() const {
+ assert(Kind == k_SysCR && "Invalid access!");
+ return SysCRImm.Val;
+ }
+
+ unsigned getPrefetch() const {
+ assert(Kind == k_Prefetch && "Invalid access!");
+ return Prefetch.Val;
+ }
+
+ AArch64_AM::ShiftExtendType getShiftExtendType() const {
+ assert(Kind == k_ShiftExtend && "Invalid access!");
+ return ShiftExtend.Type;
+ }
+
+ unsigned getShiftExtendAmount() const {
+ assert(Kind == k_ShiftExtend && "Invalid access!");
+ return ShiftExtend.Amount;
+ }
+
+ bool hasShiftExtendAmount() const {
+ assert(Kind == k_ShiftExtend && "Invalid access!");
+ return ShiftExtend.HasExplicitAmount;
+ }
+
+ bool isImm() const override { return Kind == k_Immediate; }
+ bool isMem() const override { return false; }
+ bool isSImm9() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -256 && Val < 256);
+ }
+ bool isSImm7s4() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+ }
+ bool isSImm7s8() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+ }
+ bool isSImm7s16() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+ }
+
+ bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
+ AArch64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ int64_t Addend;
+ if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
+ Addend)) {
+ // If we don't understand the expression, assume the best and
+ // let the fixup and relocation code deal with it.
return true;
- } else if (!isa<MCConstantExpr>(E)) {
- Variant = AArch64MCExpr::VK_AARCH64_None;
- return true;
+ }
+
+ if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+ ELFRefKind == AArch64MCExpr::VK_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) {
+ // Note that we don't range-check the addend. It's adjusted modulo page
+ // size when converted, so there is no "out of range" condition when using
+ // @pageoff.
+ return Addend >= 0 && (Addend % Scale) == 0;
+ } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) {
+ // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
+ return Addend == 0;
}
return false;
}
+ template <int Scale> bool isUImm12Offset() const {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return isSymbolicUImm12Offset(getImm(), Scale);
+
+ int64_t Val = MCE->getValue();
+ return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
+ }
+
+ bool isImm0_7() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 8);
+ }
+ bool isImm1_8() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val > 0 && Val < 9);
+ }
+ bool isImm0_15() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 16);
+ }
+ bool isImm1_16() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val > 0 && Val < 17);
+ }
+ bool isImm0_31() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 32);
+ }
+ bool isImm1_31() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 32);
+ }
+ bool isImm1_32() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 33);
+ }
+ bool isImm0_63() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 64);
+ }
+ bool isImm1_63() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 64);
+ }
+ bool isImm1_64() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 65);
+ }
+ bool isImm0_127() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 128);
+ }
+ bool isImm0_255() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 256);
+ }
+ bool isImm0_65535() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 65536);
+ }
+ bool isImm32_63() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 32 && Val < 64);
+ }
+ bool isLogicalImm32() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32);
+ }
+ bool isLogicalImm64() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
+ }
+ bool isShiftedImm() const { return Kind == k_ShiftedImm; }
+ bool isAddSubImm() const {
+ if (!isShiftedImm() && !isImm())
+ return false;
+
+ const MCExpr *Expr;
+
+ // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+ if (isShiftedImm()) {
+ unsigned Shift = ShiftedImm.ShiftAmount;
+ Expr = ShiftedImm.Val;
+ if (Shift != 0 && Shift != 12)
+ return false;
+ } else {
+ Expr = getImm();
+ }
+
+ AArch64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ int64_t Addend;
+ if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
+ DarwinRefKind, Addend)) {
+ return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
+ || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
+ || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
+ || ELFRefKind == AArch64MCExpr::VK_LO12
+ || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
+ || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
+ || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
+ || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
+ || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
+ || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
+ || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12;
+ }
+
+ // Otherwise it should be a real immediate in range:
+ const MCConstantExpr *CE = cast<MCConstantExpr>(Expr);
+ return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+ }
bool isCondCode() const { return Kind == k_CondCode; }
- bool isToken() const { return Kind == k_Token; }
- bool isReg() const { return Kind == k_Register; }
- bool isImm() const { return Kind == k_Immediate; }
- bool isMem() const { return false; }
- bool isFPImm() const { return Kind == k_FPImmediate; }
- bool isShiftOrExtend() const { return Kind == k_ShiftExtend; }
+ bool isSIMDImmType10() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
+ }
+ bool isBranchTarget26() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
+ if (Val & 0x3)
+ return false;
+ return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
+ }
+ bool isPCRelLabel19() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
+ if (Val & 0x3)
+ return false;
+ return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
+ }
+ bool isBranchTarget14() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
+ if (Val & 0x3)
+ return false;
+ return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+ }
+
+ bool
+ isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
+ if (!isImm())
+ return false;
+
+ AArch64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ int64_t Addend;
+ if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind,
+ DarwinRefKind, Addend)) {
+ return false;
+ }
+ if (DarwinRefKind != MCSymbolRefExpr::VK_None)
+ return false;
+
+ for (unsigned i = 0; i != AllowedModifiers.size(); ++i) {
+ if (ELFRefKind == AllowedModifiers[i])
+ return Addend == 0;
+ }
+
+ return false;
+ }
+
+ bool isMovZSymbolG3() const {
+ static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovZSymbolG2() const {
+ static AArch64MCExpr::VariantKind Variants[] = {
+ AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+ AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2};
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovZSymbolG1() const {
+ static AArch64MCExpr::VariantKind Variants[] = {
+ AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
+ AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
+ AArch64MCExpr::VK_DTPREL_G1,
+ };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovZSymbolG0() const {
+ static AArch64MCExpr::VariantKind Variants[] = {
+ AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+ AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0};
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG3() const {
+ static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG2() const {
+ static AArch64MCExpr::VariantKind Variants[] = {
+ AArch64MCExpr::VK_ABS_G2_NC};
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG1() const {
+ static AArch64MCExpr::VariantKind Variants[] = {
+ AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC,
+ AArch64MCExpr::VK_DTPREL_G1_NC
+ };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG0() const {
+ static AArch64MCExpr::VariantKind Variants[] = {
+ AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
+ AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC
+ };
+ return isMovWSymbol(Variants);
+ }
+
+ template<int RegWidth, int Shift>
+ bool isMOVZMovAlias() const {
+ if (!isImm()) return false;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ uint64_t Value = CE->getValue();
+
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+ if (Value == 0 && Shift != 0)
+ return false;
+
+ return (Value & ~(0xffffULL << Shift)) == 0;
+ }
+
+ template<int RegWidth, int Shift>
+ bool isMOVNMovAlias() const {
+ if (!isImm()) return false;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ uint64_t Value = CE->getValue();
+
+ // MOVZ takes precedence over MOVN.
+ for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
+ if ((Value & ~(0xffffULL << MOVZShift)) == 0)
+ return false;
+
+ Value = ~Value;
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ return (Value & ~(0xffffULL << Shift)) == 0;
+ }
+
+ bool isFPImm() const { return Kind == k_FPImm; }
+ bool isBarrier() const { return Kind == k_Barrier; }
bool isSysReg() const { return Kind == k_SysReg; }
- bool isImmWithLSL() const { return Kind == k_ImmWithLSL; }
- bool isWrappedReg() const { return Kind == k_WrappedRegister; }
+ bool isMRSSystemRegister() const {
+ if (!isSysReg()) return false;
- bool isAddSubImmLSL0() const {
- if (!isImmWithLSL()) return false;
- if (ImmWithLSL.ShiftAmount != 0) return false;
+ bool IsKnownRegister;
+ auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
+ Mapper.fromString(getSysReg(), IsKnownRegister);
- AArch64MCExpr::VariantKind Variant;
- if (isNonConstantExpr(ImmWithLSL.Val, Variant)) {
- return Variant == AArch64MCExpr::VK_AARCH64_LO12
- || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12
- || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC
- || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12
- || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC
- || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC_LO12;
- }
+ return IsKnownRegister;
+ }
+ bool isMSRSystemRegister() const {
+ if (!isSysReg()) return false;
- // Otherwise it should be a real immediate in range:
- const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val);
- return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+ bool IsKnownRegister;
+ auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
+ Mapper.fromString(getSysReg(), IsKnownRegister);
+
+ return IsKnownRegister;
+ }
+ bool isSystemPStateField() const {
+ if (!isSysReg()) return false;
+
+ bool IsKnownRegister;
+ AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister);
+
+ return IsKnownRegister;
+ }
+ bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
+ bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+ bool isVectorRegLo() const {
+ return Kind == k_Register && Reg.isVector &&
+ AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+ Reg.RegNum);
+ }
+ bool isGPR32as64() const {
+ return Kind == k_Register && !Reg.isVector &&
+ AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
}
- bool isAddSubImmLSL12() const {
- if (!isImmWithLSL()) return false;
- if (ImmWithLSL.ShiftAmount != 12) return false;
-
- AArch64MCExpr::VariantKind Variant;
- if (isNonConstantExpr(ImmWithLSL.Val, Variant)) {
- return Variant == AArch64MCExpr::VK_AARCH64_DTPREL_HI12
- || Variant == AArch64MCExpr::VK_AARCH64_TPREL_HI12;
- }
-
- // Otherwise it should be a real immediate in range:
- const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val);
- return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+ bool isGPR64sp0() const {
+ return Kind == k_Register && !Reg.isVector &&
+ AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
}
- template<unsigned MemSize, unsigned RmSize> bool isAddrRegExtend() const {
- if (!isShiftOrExtend()) return false;
+ /// Is this a vector list with the type implicit (presumably attached to the
+ /// instruction itself)?
+ template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+ return Kind == k_VectorList && VectorList.Count == NumRegs &&
+ !VectorList.ElementKind;
+ }
- A64SE::ShiftExtSpecifiers Ext = ShiftExtend.ShiftType;
- if (RmSize == 32 && !(Ext == A64SE::UXTW || Ext == A64SE::SXTW))
+ template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+ bool isTypedVectorList() const {
+ if (Kind != k_VectorList)
+ return false;
+ if (VectorList.Count != NumRegs)
+ return false;
+ if (VectorList.ElementKind != ElementKind)
+ return false;
+ return VectorList.NumElements == NumElements;
+ }
+
+ bool isVectorIndex1() const {
+ return Kind == k_VectorIndex && VectorIndex.Val == 1;
+ }
+ bool isVectorIndexB() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 16;
+ }
+ bool isVectorIndexH() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 8;
+ }
+ bool isVectorIndexS() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 4;
+ }
+ bool isVectorIndexD() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 2;
+ }
+ bool isToken() const override { return Kind == k_Token; }
+ bool isTokenEqual(StringRef Str) const {
+ return Kind == k_Token && getToken() == Str;
+ }
+ bool isSysCR() const { return Kind == k_SysCR; }
+ bool isPrefetch() const { return Kind == k_Prefetch; }
+ bool isShiftExtend() const { return Kind == k_ShiftExtend; }
+ bool isShifter() const {
+ if (!isShiftExtend())
return false;
- if (RmSize == 64 && !(Ext == A64SE::LSL || Ext == A64SE::SXTX))
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+ return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+ ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
+ ST == AArch64_AM::MSL);
+ }
+ bool isExtend() const {
+ if (!isShiftExtend())
return false;
- return ShiftExtend.Amount == Log2_32(MemSize) || ShiftExtend.Amount == 0;
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
+ ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
+ ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW ||
+ ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+ ET == AArch64_AM::LSL) &&
+ getShiftExtendAmount() <= 4;
+ }
+
+ bool isExtend64() const {
+ if (!isExtend())
+ return false;
+ // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX;
+ }
+ bool isExtendLSL64() const {
+ if (!isExtend())
+ return false;
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+ ET == AArch64_AM::LSL) &&
+ getShiftExtendAmount() <= 4;
+ }
+
+ template<int Width> bool isMemXExtend() const {
+ if (!isExtend())
+ return false;
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) &&
+ (getShiftExtendAmount() == Log2_32(Width / 8) ||
+ getShiftExtendAmount() == 0);
+ }
+
+ template<int Width> bool isMemWExtend() const {
+ if (!isExtend())
+ return false;
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) &&
+ (getShiftExtendAmount() == Log2_32(Width / 8) ||
+ getShiftExtendAmount() == 0);
+ }
+
+ template <unsigned width>
+ bool isArithmeticShifter() const {
+ if (!isShifter())
+ return false;
+
+ // An arithmetic shifter is LSL, LSR, or ASR.
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+ return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+ ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
+ }
+
+ template <unsigned width>
+ bool isLogicalShifter() const {
+ if (!isShifter())
+ return false;
+
+ // A logical shifter is LSL, LSR, ASR or ROR.
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+ return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+ ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) &&
+ getShiftExtendAmount() < width;
+ }
+
+ bool isMovImm32Shifter() const {
+ if (!isShifter())
+ return false;
+
+ // A MOVi shifter is LSL of 0, 16, 32, or 48.
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+ if (ST != AArch64_AM::LSL)
+ return false;
+ uint64_t Val = getShiftExtendAmount();
+ return (Val == 0 || Val == 16);
+ }
+
+ bool isMovImm64Shifter() const {
+ if (!isShifter())
+ return false;
+
+ // A MOVi shifter is LSL of 0 or 16.
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+ if (ST != AArch64_AM::LSL)
+ return false;
+ uint64_t Val = getShiftExtendAmount();
+ return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
+ }
+
+ bool isLogicalVecShifter() const {
+ if (!isShifter())
+ return false;
+
+ // A logical vector shifter is a left shift by 0, 8, 16, or 24.
+ unsigned Shift = getShiftExtendAmount();
+ return getShiftExtendType() == AArch64_AM::LSL &&
+ (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
+ }
+
+ bool isLogicalVecHalfWordShifter() const {
+ if (!isLogicalVecShifter())
+ return false;
+
+ // A logical vector shifter is a left shift by 0 or 8.
+ unsigned Shift = getShiftExtendAmount();
+ return getShiftExtendType() == AArch64_AM::LSL &&
+ (Shift == 0 || Shift == 8);
+ }
+
+ bool isMoveVecShifter() const {
+ if (!isShiftExtend())
+ return false;
+
+ // A logical vector shifter is a left shift by 8 or 16.
+ unsigned Shift = getShiftExtendAmount();
+ return getShiftExtendType() == AArch64_AM::MSL &&
+ (Shift == 8 || Shift == 16);
+ }
+
+ // Fallback unscaled operands are for aliases of LDR/STR that fall back
+ // to LDUR/STUR when the offset is not legal for the former but is for
+ // the latter. As such, in addition to checking for being a legal unscaled
+ // address, also check that it is not a legal scaled address. This avoids
+ // ambiguity in the matcher.
+ template<int Width>
+ bool isSImm9OffsetFB() const {
+ return isSImm9() && !isUImm12Offset<Width / 8>();
}
bool isAdrpLabel() const {
- if (!isImm()) return false;
+ // Validation was handled during parsing, so we just sanity check that
+ // something didn't go haywire.
+ if (!isImm())
+ return false;
- AArch64MCExpr::VariantKind Variant;
- if (isNonConstantExpr(getImm(), Variant)) {
- return Variant == AArch64MCExpr::VK_AARCH64_None
- || Variant == AArch64MCExpr::VK_AARCH64_GOT
- || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL
- || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC;
- }
-
- return isLabel<21, 4096>();
- }
-
- template<unsigned RegWidth> bool isBitfieldWidth() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
-
- return CE->getValue() >= 1 && CE->getValue() <= RegWidth;
- }
-
- template<int RegWidth>
- bool isCVTFixedPos() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
-
- return CE->getValue() >= 1 && CE->getValue() <= RegWidth;
- }
-
- bool isFMOVImm() const {
- if (!isFPImm()) return false;
-
- APFloat RealVal(FPImm.Val);
- uint32_t ImmVal;
- return A64Imms::isFPImm(RealVal, ImmVal);
- }
-
- bool isFPZero() const {
- if (!isFPImm()) return false;
-
- APFloat RealVal(FPImm.Val);
- return RealVal.isPosZero();
- }
-
- template<unsigned field_width, unsigned scale>
- bool isLabel() const {
- if (!isImm()) return false;
-
- if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) {
- return true;
- } else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
int64_t Val = CE->getValue();
- int64_t Min = - (scale * (1LL << (field_width - 1)));
- int64_t Max = scale * ((1LL << (field_width - 1)) - 1);
- return (Val % scale) == 0 && Val >= Min && Val <= Max;
+ int64_t Min = - (4096 * (1LL << (21 - 1)));
+ int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
+ return (Val % 4096) == 0 && Val >= Min && Val <= Max;
}
- // N.b. this disallows explicit relocation specifications via an
- // AArch64MCExpr. Users needing that behaviour
- return false;
- }
-
- bool isLane1() const {
- if (!isImm()) return false;
-
- // Because it's come through custom assembly parsing, it must always be a
- // constant expression.
- return cast<MCConstantExpr>(getImm())->getValue() == 1;
- }
-
- bool isLoadLitLabel() const {
- if (!isImm()) return false;
-
- AArch64MCExpr::VariantKind Variant;
- if (isNonConstantExpr(getImm(), Variant)) {
- return Variant == AArch64MCExpr::VK_AARCH64_None
- || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL;
- }
-
- return isLabel<19, 4>();
- }
-
- template<unsigned RegWidth> bool isLogicalImm() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
- if (!CE) return false;
-
- uint32_t Bits;
- return A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits);
- }
-
- template<unsigned RegWidth> bool isLogicalImmMOV() const {
- if (!isLogicalImm<RegWidth>()) return false;
-
- const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val);
-
- // The move alias for ORR is only valid if the immediate cannot be
- // represented with a move (immediate) instruction; they take priority.
- int UImm16, Shift;
- return !A64Imms::isMOVZImm(RegWidth, CE->getValue(), UImm16, Shift)
- && !A64Imms::isMOVNImm(RegWidth, CE->getValue(), UImm16, Shift);
- }
-
- template<int MemSize>
- bool isOffsetUImm12() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-
- // Assume they know what they're doing for now if they've given us a
- // non-constant expression. In principle we could check for ridiculous
- // things that can't possibly work or relocations that would almost
- // certainly break resulting code.
- if (!CE)
- return true;
-
- int64_t Val = CE->getValue();
-
- // Must be a multiple of the access size in bytes.
- if ((Val & (MemSize - 1)) != 0) return false;
-
- // Must be 12-bit unsigned
- return Val >= 0 && Val <= 0xfff * MemSize;
- }
-
- template<A64SE::ShiftExtSpecifiers SHKind, bool is64Bit>
- bool isShift() const {
- if (!isShiftOrExtend()) return false;
-
- if (ShiftExtend.ShiftType != SHKind)
- return false;
-
- return is64Bit ? ShiftExtend.Amount <= 63 : ShiftExtend.Amount <= 31;
- }
-
- bool isMOVN32Imm() const {
- static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
- AArch64MCExpr::VK_AARCH64_SABS_G0,
- AArch64MCExpr::VK_AARCH64_SABS_G1,
- AArch64MCExpr::VK_AARCH64_DTPREL_G1,
- AArch64MCExpr::VK_AARCH64_DTPREL_G0,
- AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G0,
- };
- const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
- return isMoveWideImm(32, PermittedModifiers, NumModifiers);
- }
-
- bool isMOVN64Imm() const {
- static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
- AArch64MCExpr::VK_AARCH64_SABS_G0,
- AArch64MCExpr::VK_AARCH64_SABS_G1,
- AArch64MCExpr::VK_AARCH64_SABS_G2,
- AArch64MCExpr::VK_AARCH64_DTPREL_G2,
- AArch64MCExpr::VK_AARCH64_DTPREL_G1,
- AArch64MCExpr::VK_AARCH64_DTPREL_G0,
- AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G2,
- AArch64MCExpr::VK_AARCH64_TPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G0,
- };
- const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
- return isMoveWideImm(64, PermittedModifiers, NumModifiers);
- }
-
-
- bool isMOVZ32Imm() const {
- static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
- AArch64MCExpr::VK_AARCH64_ABS_G0,
- AArch64MCExpr::VK_AARCH64_ABS_G1,
- AArch64MCExpr::VK_AARCH64_SABS_G0,
- AArch64MCExpr::VK_AARCH64_SABS_G1,
- AArch64MCExpr::VK_AARCH64_DTPREL_G1,
- AArch64MCExpr::VK_AARCH64_DTPREL_G0,
- AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G0,
- };
- const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
- return isMoveWideImm(32, PermittedModifiers, NumModifiers);
- }
-
- bool isMOVZ64Imm() const {
- static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
- AArch64MCExpr::VK_AARCH64_ABS_G0,
- AArch64MCExpr::VK_AARCH64_ABS_G1,
- AArch64MCExpr::VK_AARCH64_ABS_G2,
- AArch64MCExpr::VK_AARCH64_ABS_G3,
- AArch64MCExpr::VK_AARCH64_SABS_G0,
- AArch64MCExpr::VK_AARCH64_SABS_G1,
- AArch64MCExpr::VK_AARCH64_SABS_G2,
- AArch64MCExpr::VK_AARCH64_DTPREL_G2,
- AArch64MCExpr::VK_AARCH64_DTPREL_G1,
- AArch64MCExpr::VK_AARCH64_DTPREL_G0,
- AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G2,
- AArch64MCExpr::VK_AARCH64_TPREL_G1,
- AArch64MCExpr::VK_AARCH64_TPREL_G0,
- };
- const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
- return isMoveWideImm(64, PermittedModifiers, NumModifiers);
- }
-
- bool isMOVK32Imm() const {
- static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
- AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
- AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
- AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
- AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC,
- AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC,
- AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
- AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
- };
- const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
- return isMoveWideImm(32, PermittedModifiers, NumModifiers);
- }
-
- bool isMOVK64Imm() const {
- static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
- AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
- AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
- AArch64MCExpr::VK_AARCH64_ABS_G2_NC,
- AArch64MCExpr::VK_AARCH64_ABS_G3,
- AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
- AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC,
- AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC,
- AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
- AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
- };
- const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
- return isMoveWideImm(64, PermittedModifiers, NumModifiers);
- }
-
- bool isMoveWideImm(unsigned RegWidth,
- const AArch64MCExpr::VariantKind *PermittedModifiers,
- unsigned NumModifiers) const {
- if (!isImmWithLSL()) return false;
-
- if (ImmWithLSL.ShiftAmount % 16 != 0) return false;
- if (ImmWithLSL.ShiftAmount >= RegWidth) return false;
-
- AArch64MCExpr::VariantKind Modifier;
- if (isNonConstantExpr(ImmWithLSL.Val, Modifier)) {
- // E.g. "#:abs_g0:sym, lsl #16" makes no sense.
- if (!ImmWithLSL.ImplicitAmount) return false;
-
- for (unsigned i = 0; i < NumModifiers; ++i)
- if (PermittedModifiers[i] == Modifier) return true;
-
- return false;
- }
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmWithLSL.Val);
- return CE && CE->getValue() >= 0 && CE->getValue() <= 0xffff;
- }
-
- template<int RegWidth, bool (*isValidImm)(int, uint64_t, int&, int&)>
- bool isMoveWideMovAlias() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
-
- int UImm16, Shift;
- uint64_t Value = CE->getValue();
-
- // If this is a 32-bit instruction then all bits above 32 should be the
- // same: either of these is fine because signed/unsigned values should be
- // permitted.
- if (RegWidth == 32) {
- if ((Value >> 32) != 0 && (Value >> 32) != 0xffffffff)
- return false;
-
- Value &= 0xffffffffULL;
- }
-
- return isValidImm(RegWidth, Value, UImm16, Shift);
- }
-
- bool isMSRWithReg() const {
- if (!isSysReg()) return false;
-
- bool IsKnownRegister;
- StringRef Name(SysReg.Data, SysReg.Length);
- A64SysReg::MSRMapper().fromString(Name, IsKnownRegister);
-
- return IsKnownRegister;
- }
-
- bool isMSRPState() const {
- if (!isSysReg()) return false;
-
- bool IsKnownRegister;
- StringRef Name(SysReg.Data, SysReg.Length);
- A64PState::PStateMapper().fromString(Name, IsKnownRegister);
-
- return IsKnownRegister;
- }
-
- bool isMRS() const {
- if (!isSysReg()) return false;
-
- // First check against specific MSR-only (write-only) registers
- bool IsKnownRegister;
- StringRef Name(SysReg.Data, SysReg.Length);
- A64SysReg::MRSMapper().fromString(Name, IsKnownRegister);
-
- return IsKnownRegister;
- }
-
- bool isPRFM() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-
- if (!CE)
- return false;
-
- return CE->getValue() >= 0 && CE->getValue() <= 31;
- }
-
- template<A64SE::ShiftExtSpecifiers SHKind> bool isRegExtend() const {
- if (!isShiftOrExtend()) return false;
-
- if (ShiftExtend.ShiftType != SHKind)
- return false;
-
- return ShiftExtend.Amount <= 4;
- }
-
- bool isRegExtendLSL() const {
- if (!isShiftOrExtend()) return false;
-
- if (ShiftExtend.ShiftType != A64SE::LSL)
- return false;
-
- return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
- }
-
- // if 0 < value <= w, return true
- bool isShrFixedWidth(int w) const {
- if (!isImm())
- return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
- return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= w;
- }
-
- bool isShrImm8() const { return isShrFixedWidth(8); }
-
- bool isShrImm16() const { return isShrFixedWidth(16); }
-
- bool isShrImm32() const { return isShrFixedWidth(32); }
-
- bool isShrImm64() const { return isShrFixedWidth(64); }
-
- // if 0 <= value < w, return true
- bool isShlFixedWidth(int w) const {
- if (!isImm())
- return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
- return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < w;
- }
-
- bool isShlImm8() const { return isShlFixedWidth(8); }
-
- bool isShlImm16() const { return isShlFixedWidth(16); }
-
- bool isShlImm32() const { return isShlFixedWidth(32); }
-
- bool isShlImm64() const { return isShlFixedWidth(64); }
-
- bool isNeonMovImmShiftLSL() const {
- if (!isShiftOrExtend())
- return false;
-
- if (ShiftExtend.ShiftType != A64SE::LSL)
- return false;
-
- // Valid shift amount is 0, 8, 16 and 24.
- return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24;
- }
-
- bool isNeonMovImmShiftLSLH() const {
- if (!isShiftOrExtend())
- return false;
-
- if (ShiftExtend.ShiftType != A64SE::LSL)
- return false;
-
- // Valid shift amount is 0 and 8.
- return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8;
- }
-
- bool isNeonMovImmShiftMSL() const {
- if (!isShiftOrExtend())
- return false;
-
- if (ShiftExtend.ShiftType != A64SE::MSL)
- return false;
-
- // Valid shift amount is 8 and 16.
- return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
- }
-
- template <A64Layout::VectorLayout Layout, unsigned Count>
- bool isVectorList() const {
- return Kind == k_VectorList && VectorList.Layout == Layout &&
- VectorList.Count == Count;
- }
-
- template <int MemSize> bool isSImm7Scaled() const {
- if (!isImm())
- return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
-
- int64_t Val = CE->getValue();
- if (Val % MemSize != 0) return false;
-
- Val /= MemSize;
-
- return Val >= -64 && Val < 64;
- }
-
- template<int BitWidth>
- bool isSImm() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
-
- return CE->getValue() >= -(1LL << (BitWidth - 1))
- && CE->getValue() < (1LL << (BitWidth - 1));
- }
-
- template<int bitWidth>
- bool isUImm() const {
- if (!isImm()) return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
-
- return CE->getValue() >= 0 && CE->getValue() < (1LL << bitWidth);
- }
-
- bool isUImm() const {
- if (!isImm()) return false;
-
- return isa<MCConstantExpr>(getImm());
- }
-
- bool isNeonUImm64Mask() const {
- if (!isImm())
- return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
- return false;
-
- uint64_t Value = CE->getValue();
-
- // i64 value with each byte being either 0x00 or 0xff.
- for (unsigned i = 0; i < 8; ++i, Value >>= 8)
- if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff)
- return false;
return true;
}
- // if value == N, return true
- template<int N>
- bool isExactImm() const {
- if (!isImm()) return false;
+ bool isAdrLabel() const {
+ // Validation was handled during parsing, so we just sanity check that
+ // something didn't go haywire.
+ if (!isImm())
+ return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Val = CE->getValue();
+ int64_t Min = - (1LL << (21 - 1));
+ int64_t Max = ((1LL << (21 - 1)) - 1);
+ return Val >= Min && Val <= Max;
+ }
- return CE->getValue() == N;
+ return true;
}
- bool isFPZeroIZero() const {
- return isFPZero();
- }
-
- static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
- unsigned ShiftAmount,
- bool ImplicitAmount,
- SMLoc S,SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E);
- Op->ImmWithLSL.Val = Val;
- Op->ImmWithLSL.ShiftAmount = ShiftAmount;
- Op->ImmWithLSL.ImplicitAmount = ImplicitAmount;
- return Op;
- }
-
- static AArch64Operand *CreateCondCode(A64CC::CondCodes Code,
- SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_CondCode, S, E);
- Op->CondCode.Code = Code;
- return Op;
- }
-
- static AArch64Operand *CreateFPImm(double Val,
- SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_FPImmediate, S, E);
- Op->FPImm.Val = Val;
- return Op;
- }
-
- static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_Immediate, S, E);
- Op->Imm.Val = Val;
- return Op;
- }
-
- static AArch64Operand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_Register, S, E);
- Op->Reg.RegNum = RegNum;
- return Op;
- }
-
- static AArch64Operand *CreateWrappedReg(unsigned RegNum, SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_WrappedRegister, S, E);
- Op->Reg.RegNum = RegNum;
- return Op;
- }
-
- static AArch64Operand *CreateShiftExtend(A64SE::ShiftExtSpecifiers ShiftTyp,
- unsigned Amount,
- bool ImplicitAmount,
- SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, S, E);
- Op->ShiftExtend.ShiftType = ShiftTyp;
- Op->ShiftExtend.Amount = Amount;
- Op->ShiftExtend.ImplicitAmount = ImplicitAmount;
- return Op;
- }
-
- static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S) {
- AArch64Operand *Op = new AArch64Operand(k_SysReg, S, S);
- Op->Tok.Data = Str.data();
- Op->Tok.Length = Str.size();
- return Op;
- }
-
- static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
- A64Layout::VectorLayout Layout,
- SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
- Op->VectorList.RegNum = RegNum;
- Op->VectorList.Count = Count;
- Op->VectorList.Layout = Layout;
- Op->StartLoc = S;
- Op->EndLoc = E;
- return Op;
- }
-
- static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
- AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
- Op->Tok.Data = Str.data();
- Op->Tok.Length = Str.size();
- return Op;
- }
-
-
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
- // Add as immediates when possible.
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ // Add as immediates when possible. Null MCExpr = 0.
+ if (!Expr)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
else
Inst.addOperand(MCOperand::CreateExpr(Expr));
}
- template<unsigned RegWidth>
- void addBFILSBOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- unsigned EncodedVal = (RegWidth - CE->getValue()) % RegWidth;
- Inst.addOperand(MCOperand::CreateImm(EncodedVal));
- }
-
- void addBFIWidthOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
- }
-
- void addBFXWidthOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- uint64_t LSB = Inst.getOperand(Inst.getNumOperands()-1).getImm();
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-
- Inst.addOperand(MCOperand::CreateImm(LSB + CE->getValue() - 1));
- }
-
- void addCondCodeOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateImm(getCondCode()));
- }
-
- void addCVTFixedPosOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::CreateImm(64 - CE->getValue()));
- }
-
- void addFMOVImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- APFloat RealVal(FPImm.Val);
- uint32_t ImmVal;
- A64Imms::isFPImm(RealVal, ImmVal);
-
- Inst.addOperand(MCOperand::CreateImm(ImmVal));
- }
-
- void addFPZeroOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands");
- Inst.addOperand(MCOperand::CreateImm(0));
- }
-
- void addFPZeroIZeroOperands(MCInst &Inst, unsigned N) const {
- addFPZeroOperands(Inst, N);
- }
-
- void addInvCondCodeOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- unsigned Encoded = A64InvertCondCode(getCondCode());
- Inst.addOperand(MCOperand::CreateImm(Encoded));
- }
-
void addRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(getReg()));
}
+ void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(
+ AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg()));
+
+ const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+ uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister(
+ RI->getEncodingValue(getReg()));
+
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ }
+
+ void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(
+ AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+ Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0));
+ }
+
+ void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(
+ AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ template <unsigned NumRegs>
+ void addVectorList64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1,
+ AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 };
+ unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+ Inst.addOperand(
+ MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0));
+ }
+
+ template <unsigned NumRegs>
+ void addVectorList128Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1,
+ AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 };
+ unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+ Inst.addOperand(
+ MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0));
+ }
+
+ void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
+ // If this is a pageoff symrefexpr with an addend, adjust the addend
+ // to be only the page-offset portion. Otherwise, just add the expr
+ // as-is.
addExpr(Inst, getImm());
}
- template<int MemSize>
- void addSImm7ScaledOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- uint64_t Val = CE->getValue() / MemSize;
- Inst.addOperand(MCOperand::CreateImm(Val & 0x7f));
- }
-
- template<int BitWidth>
- void addSImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- uint64_t Val = CE->getValue();
- Inst.addOperand(MCOperand::CreateImm(Val & ((1ULL << BitWidth) - 1)));
- }
-
- void addImmWithLSLOperands(MCInst &Inst, unsigned N) const {
- assert (N == 1 && "Invalid number of operands!");
-
- addExpr(Inst, ImmWithLSL.Val);
- }
-
- template<unsigned field_width, unsigned scale>
- void addLabelOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
-
- if (!CE) {
- addExpr(Inst, Imm.Val);
- return;
- }
-
- int64_t Val = CE->getValue();
- assert(Val % scale == 0 && "Unaligned immediate in instruction");
- Val /= scale;
-
- Inst.addOperand(MCOperand::CreateImm(Val & ((1LL << field_width) - 1)));
- }
-
- template<int MemSize>
- void addOffsetUImm12Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
- Inst.addOperand(MCOperand::CreateImm(CE->getValue() / MemSize));
+ void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ if (isShiftedImm()) {
+ addExpr(Inst, getShiftedImmVal());
+ Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift()));
} else {
- Inst.addOperand(MCOperand::CreateExpr(getImm()));
+ addExpr(Inst, getImm());
+ Inst.addOperand(MCOperand::CreateImm(0));
}
}
- template<unsigned RegWidth>
- void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands");
- const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val);
-
- uint32_t Bits;
- A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits);
-
- Inst.addOperand(MCOperand::CreateImm(Bits));
- }
-
- void addMRSOperands(MCInst &Inst, unsigned N) const {
+ void addCondCodeOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
-
- bool Valid;
- StringRef Name(SysReg.Data, SysReg.Length);
- uint32_t Bits = A64SysReg::MRSMapper().fromString(Name, Valid);
-
- Inst.addOperand(MCOperand::CreateImm(Bits));
+ Inst.addOperand(MCOperand::CreateImm(getCondCode()));
}
- void addMSRWithRegOperands(MCInst &Inst, unsigned N) const {
+ void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
-
- bool Valid;
- StringRef Name(SysReg.Data, SysReg.Length);
- uint32_t Bits = A64SysReg::MSRMapper().fromString(Name, Valid);
-
- Inst.addOperand(MCOperand::CreateImm(Bits));
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ addExpr(Inst, getImm());
+ else
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12));
}
- void addMSRPStateOperands(MCInst &Inst, unsigned N) const {
+ void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ template<int Scale>
+ void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- bool Valid;
- StringRef Name(SysReg.Data, SysReg.Length);
- uint32_t Bits = A64PState::PStateMapper().fromString(Name, Valid);
-
- Inst.addOperand(MCOperand::CreateImm(Bits));
- }
-
- void addMoveWideImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 2 && "Invalid number of operands!");
-
- addExpr(Inst, ImmWithLSL.Val);
-
- AArch64MCExpr::VariantKind Variant;
- if (!isNonConstantExpr(ImmWithLSL.Val, Variant)) {
- Inst.addOperand(MCOperand::CreateImm(ImmWithLSL.ShiftAmount / 16));
+ if (!MCE) {
+ Inst.addOperand(MCOperand::CreateExpr(getImm()));
return;
}
-
- // We know it's relocated
- switch (Variant) {
- case AArch64MCExpr::VK_AARCH64_ABS_G0:
- case AArch64MCExpr::VK_AARCH64_ABS_G0_NC:
- case AArch64MCExpr::VK_AARCH64_SABS_G0:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC:
- case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC:
- case AArch64MCExpr::VK_AARCH64_TPREL_G0:
- case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC:
- Inst.addOperand(MCOperand::CreateImm(0));
- break;
- case AArch64MCExpr::VK_AARCH64_ABS_G1:
- case AArch64MCExpr::VK_AARCH64_ABS_G1_NC:
- case AArch64MCExpr::VK_AARCH64_SABS_G1:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC:
- case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
- case AArch64MCExpr::VK_AARCH64_TPREL_G1:
- case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC:
- Inst.addOperand(MCOperand::CreateImm(1));
- break;
- case AArch64MCExpr::VK_AARCH64_ABS_G2:
- case AArch64MCExpr::VK_AARCH64_ABS_G2_NC:
- case AArch64MCExpr::VK_AARCH64_SABS_G2:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
- case AArch64MCExpr::VK_AARCH64_TPREL_G2:
- Inst.addOperand(MCOperand::CreateImm(2));
- break;
- case AArch64MCExpr::VK_AARCH64_ABS_G3:
- Inst.addOperand(MCOperand::CreateImm(3));
- break;
- default: llvm_unreachable("Inappropriate move wide relocation");
- }
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale));
}
- template<int RegWidth, bool isValidImm(int, uint64_t, int&, int&)>
- void addMoveWideMovAliasOperands(MCInst &Inst, unsigned N) const {
+ void addSImm9Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
+ }
+
+ void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
+ }
+
+ void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
+ }
+
+ void addImm0_7Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_15Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_31Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_63Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_63Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_127Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_255Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm32_63Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid logical immediate operand!");
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32);
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid logical immediate operand!");
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid immediate operand!");
+ uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
+ // Branch operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+ }
+
+ void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
+ // Branch operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+ }
+
+ void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
+ // Branch operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+ }
+
+ void addFPImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getFPImm()));
+ }
+
+ void addBarrierOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getBarrier()));
+ }
+
+ void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ bool Valid;
+ auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
+ uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
+
+ Inst.addOperand(MCOperand::CreateImm(Bits));
+ }
+
+ void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ bool Valid;
+ auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
+ uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
+
+ Inst.addOperand(MCOperand::CreateImm(Bits));
+ }
+
+ void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ bool Valid;
+ uint32_t Bits =
+ AArch64PState::PStateMapper().fromString(getSysReg(), Valid);
+
+ Inst.addOperand(MCOperand::CreateImm(Bits));
+ }
+
+ void addSysCROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getSysCR()));
+ }
+
+ void addPrefetchOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getPrefetch()));
+ }
+
+ void addShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned Imm =
+ AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount());
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addExtendOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW;
+ unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addExtend64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX;
+ unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addMemExtendOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- int UImm16, Shift;
-
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- uint64_t Value = CE->getValue();
-
- if (RegWidth == 32) {
- Value &= 0xffffffffULL;
- }
-
- bool Valid = isValidImm(RegWidth, Value, UImm16, Shift);
- (void)Valid;
- assert(Valid && "Invalid immediates should have been weeded out by now");
-
- Inst.addOperand(MCOperand::CreateImm(UImm16));
- Inst.addOperand(MCOperand::CreateImm(Shift));
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+ Inst.addOperand(MCOperand::CreateImm(IsSigned));
+ Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0));
}
- void addPRFMOperands(MCInst &Inst, unsigned N) const {
+ // For 8-bit load/store instructions with a register offset, both the
+ // "DoShift" and "NoShift" variants have a shift of 0. Because of this,
+ // they're disambiguated by whether the shift was explicit or implicit rather
+ // than its size.
+ void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+ bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+ Inst.addOperand(MCOperand::CreateImm(IsSigned));
+ Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount()));
+ }
+
+ template<int Shift>
+ void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- assert(CE->getValue() >= 0 && CE->getValue() <= 31
- && "PRFM operand should be 5-bits");
-
- Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
- }
-
- // For Add-sub (extended register) operands.
- void addRegExtendOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
- }
-
- // For Vector Immediates shifted imm operands.
- void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24)
- llvm_unreachable("Invalid shift amount for vector immediate inst.");
-
- // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3.
- int64_t Imm = ShiftExtend.Amount / 8;
- Inst.addOperand(MCOperand::CreateImm(Imm));
- }
-
- void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8)
- llvm_unreachable("Invalid shift amount for vector immediate inst.");
-
- // Encode LSLH shift amount 0, 8 as 0, 1.
- int64_t Imm = ShiftExtend.Amount / 8;
- Inst.addOperand(MCOperand::CreateImm(Imm));
- }
-
- void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16)
- llvm_unreachable("Invalid shift amount for vector immediate inst.");
-
- // Encode MSL shift amount 8, 16 as 0, 1.
- int64_t Imm = ShiftExtend.Amount / 8 - 1;
- Inst.addOperand(MCOperand::CreateImm(Imm));
- }
-
- // For the extend in load-store (register offset) instructions.
- template<unsigned MemSize>
- void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const {
- addAddrRegExtendOperands(Inst, N, MemSize);
- }
-
- void addAddrRegExtendOperands(MCInst &Inst, unsigned N,
- unsigned MemSize) const {
- assert(N == 1 && "Invalid number of operands!");
-
- // First bit of Option is set in instruction classes, the high two bits are
- // as follows:
- unsigned OptionHi = 0;
- switch (ShiftExtend.ShiftType) {
- case A64SE::UXTW:
- case A64SE::LSL:
- OptionHi = 1;
- break;
- case A64SE::SXTW:
- case A64SE::SXTX:
- OptionHi = 3;
- break;
- default:
- llvm_unreachable("Invalid extend type for register offset");
- }
-
- unsigned S = 0;
- if (MemSize == 1 && !ShiftExtend.ImplicitAmount)
- S = 1;
- else if (MemSize != 1 && ShiftExtend.Amount != 0)
- S = 1;
-
- Inst.addOperand(MCOperand::CreateImm((OptionHi << 1) | S));
- }
- void addShiftOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
- }
-
- void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
-
- // A bit from each byte in the constant forms the encoded immediate
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
uint64_t Value = CE->getValue();
-
- unsigned Imm = 0;
- for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
- Imm |= (Value & 1) << i;
- }
- Inst.addOperand(MCOperand::CreateImm(Imm));
+ Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff));
}
- void addVectorListOperands(MCInst &Inst, unsigned N) const {
+ template<int Shift>
+ void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+ uint64_t Value = CE->getValue();
+ Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff));
+ }
+
+ void print(raw_ostream &OS) const override;
+
+ static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S,
+ MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_Token, Ctx);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->Tok.IsSuffix = IsSuffix;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S,
+ SMLoc E, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_Register, Ctx);
+ Op->Reg.RegNum = RegNum;
+ Op->Reg.isVector = isVector;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
+ unsigned NumElements, char ElementKind,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx);
+ Op->VectorList.RegNum = RegNum;
+ Op->VectorList.Count = Count;
+ Op->VectorList.NumElements = NumElements;
+ Op->VectorList.ElementKind = ElementKind;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx);
+ Op->VectorIndex.Val = Idx;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreateShiftedImm(const MCExpr *Val,
+ unsigned ShiftAmount, SMLoc S,
+ SMLoc E, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx);
+ Op->ShiftedImm .Val = Val;
+ Op->ShiftedImm.ShiftAmount = ShiftAmount;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S,
+ SMLoc E, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx);
+ Op->CondCode.Code = Code;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx);
+ Op->FPImm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx);
+ Op->Barrier.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S,
+ uint64_t FeatureBits, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx);
+ Op->SysReg.Data = Str.data();
+ Op->SysReg.Length = Str.size();
+ Op->SysReg.FeatureBits = FeatureBits;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx);
+ Op->SysCRImm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx);
+ Op->Prefetch.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp,
+ unsigned Val, bool HasExplicitAmount,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx);
+ Op->ShiftExtend.Type = ShOp;
+ Op->ShiftExtend.Amount = Val;
+ Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
}
};
} // end anonymous namespace.
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Mnemonic) {
-
- // See if the operand has a custom parser
- OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
-
- // It could either succeed, fail or just not care.
- if (ResTy != MatchOperand_NoMatch)
- return ResTy;
-
- switch (getLexer().getKind()) {
- default:
- Error(Parser.getTok().getLoc(), "unexpected token in operand");
- return MatchOperand_ParseFail;
- case AsmToken::Identifier: {
- // It might be in the LSL/UXTB family ...
- OperandMatchResultTy GotShift = ParseShiftExtend(Operands);
-
- // We can only continue if no tokens were eaten.
- if (GotShift != MatchOperand_NoMatch)
- return GotShift;
-
- // ... or it might be a register ...
- uint32_t NumLanes = 0;
- OperandMatchResultTy GotReg = ParseRegister(Operands, NumLanes);
- assert(GotReg != MatchOperand_ParseFail
- && "register parsing shouldn't partially succeed");
-
- if (GotReg == MatchOperand_Success) {
- if (Parser.getTok().is(AsmToken::LBrac))
- return ParseNEONLane(Operands, NumLanes);
- else
- return MatchOperand_Success;
- }
- // ... or it might be a symbolish thing
+void AArch64Operand::print(raw_ostream &OS) const {
+ switch (Kind) {
+ case k_FPImm:
+ OS << "<fpimm " << getFPImm() << "("
+ << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+ break;
+ case k_Barrier: {
+ bool Valid;
+ StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid);
+ if (Valid)
+ OS << "<barrier " << Name << ">";
+ else
+ OS << "<barrier invalid #" << getBarrier() << ">";
+ break;
}
- // Fall through
- case AsmToken::LParen: // E.g. (strcmp-4)
- case AsmToken::Integer: // 1f, 2b labels
- case AsmToken::String: // quoted labels
- case AsmToken::Dot: // . is Current location
- case AsmToken::Dollar: // $ is PC
- case AsmToken::Colon: {
- SMLoc StartLoc = Parser.getTok().getLoc();
- SMLoc EndLoc;
- const MCExpr *ImmVal = 0;
-
- if (ParseImmediate(ImmVal) != MatchOperand_Success)
- return MatchOperand_ParseFail;
-
- EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc));
- return MatchOperand_Success;
+ case k_Immediate:
+ getImm()->print(OS);
+ break;
+ case k_ShiftedImm: {
+ unsigned Shift = getShiftedImmShift();
+ OS << "<shiftedimm ";
+ getShiftedImmVal()->print(OS);
+ OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+ break;
}
- case AsmToken::Hash: { // Immediates
- SMLoc StartLoc = Parser.getTok().getLoc();
- SMLoc EndLoc;
- const MCExpr *ImmVal = 0;
- Parser.Lex();
-
- if (ParseImmediate(ImmVal) != MatchOperand_Success)
- return MatchOperand_ParseFail;
-
- EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc));
- return MatchOperand_Success;
+ case k_CondCode:
+ OS << "<condcode " << getCondCode() << ">";
+ break;
+ case k_Register:
+ OS << "<register " << getReg() << ">";
+ break;
+ case k_VectorList: {
+ OS << "<vectorlist ";
+ unsigned Reg = getVectorListStart();
+ for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+ OS << Reg + i << " ";
+ OS << ">";
+ break;
}
- case AsmToken::LBrac: {
- SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateToken("[", Loc));
- Parser.Lex(); // Eat '['
-
- // There's no comma after a '[', so we can parse the next operand
- // immediately.
- return ParseOperand(Operands, Mnemonic);
+ case k_VectorIndex:
+ OS << "<vectorindex " << getVectorIndex() << ">";
+ break;
+ case k_SysReg:
+ OS << "<sysreg: " << getSysReg() << '>';
+ break;
+ case k_Token:
+ OS << "'" << getToken() << "'";
+ break;
+ case k_SysCR:
+ OS << "c" << getSysCR();
+ break;
+ case k_Prefetch: {
+ bool Valid;
+ StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid);
+ if (Valid)
+ OS << "<prfop " << Name << ">";
+ else
+ OS << "<prfop invalid #" << getPrefetch() << ">";
+ break;
}
- // The following will likely be useful later, but not in very early cases
- case AsmToken::LCurly: // SIMD vector list is not parsed here
- llvm_unreachable("Don't know how to deal with '{' in operand");
- return MatchOperand_ParseFail;
+ case k_ShiftExtend: {
+ OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+ << getShiftExtendAmount();
+ if (!hasShiftExtendAmount())
+ OS << "<imp>";
+ OS << '>';
+ break;
+ }
}
}
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseImmediate(const MCExpr *&ExprVal) {
- if (getLexer().is(AsmToken::Colon)) {
- AArch64MCExpr::VariantKind RefKind;
+/// @name Auto-generated Match Functions
+/// {
- OperandMatchResultTy ResTy = ParseRelocPrefix(RefKind);
- if (ResTy != MatchOperand_Success)
- return ResTy;
+static unsigned MatchRegisterName(StringRef Name);
- const MCExpr *SubExprVal;
- if (getParser().parseExpression(SubExprVal))
- return MatchOperand_ParseFail;
+/// }
- ExprVal = AArch64MCExpr::Create(RefKind, SubExprVal, getContext());
- return MatchOperand_Success;
- }
-
- // No weird AArch64MCExpr prefix
- return getParser().parseExpression(ExprVal)
- ? MatchOperand_ParseFail : MatchOperand_Success;
+static unsigned matchVectorRegName(StringRef Name) {
+ return StringSwitch<unsigned>(Name)
+ .Case("v0", AArch64::Q0)
+ .Case("v1", AArch64::Q1)
+ .Case("v2", AArch64::Q2)
+ .Case("v3", AArch64::Q3)
+ .Case("v4", AArch64::Q4)
+ .Case("v5", AArch64::Q5)
+ .Case("v6", AArch64::Q6)
+ .Case("v7", AArch64::Q7)
+ .Case("v8", AArch64::Q8)
+ .Case("v9", AArch64::Q9)
+ .Case("v10", AArch64::Q10)
+ .Case("v11", AArch64::Q11)
+ .Case("v12", AArch64::Q12)
+ .Case("v13", AArch64::Q13)
+ .Case("v14", AArch64::Q14)
+ .Case("v15", AArch64::Q15)
+ .Case("v16", AArch64::Q16)
+ .Case("v17", AArch64::Q17)
+ .Case("v18", AArch64::Q18)
+ .Case("v19", AArch64::Q19)
+ .Case("v20", AArch64::Q20)
+ .Case("v21", AArch64::Q21)
+ .Case("v22", AArch64::Q22)
+ .Case("v23", AArch64::Q23)
+ .Case("v24", AArch64::Q24)
+ .Case("v25", AArch64::Q25)
+ .Case("v26", AArch64::Q26)
+ .Case("v27", AArch64::Q27)
+ .Case("v28", AArch64::Q28)
+ .Case("v29", AArch64::Q29)
+ .Case("v30", AArch64::Q30)
+ .Case("v31", AArch64::Q31)
+ .Default(0);
}
-// A lane attached to a NEON register. "[N]", which should yield three tokens:
-// '[', N, ']'. A hash is not allowed to precede the immediate here.
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- uint32_t NumLanes) {
- SMLoc Loc = Parser.getTok().getLoc();
-
- assert(Parser.getTok().is(AsmToken::LBrac) && "inappropriate operand");
- Operands.push_back(AArch64Operand::CreateToken("[", Loc));
- Parser.Lex(); // Eat '['
-
- if (Parser.getTok().isNot(AsmToken::Integer)) {
- Error(Parser.getTok().getLoc(), "expected lane number");
- return MatchOperand_ParseFail;
- }
-
- if (Parser.getTok().getIntVal() >= NumLanes) {
- Error(Parser.getTok().getLoc(), "lane number incompatible with layout");
- return MatchOperand_ParseFail;
- }
-
- const MCExpr *Lane = MCConstantExpr::Create(Parser.getTok().getIntVal(),
- getContext());
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat actual lane
- SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateImm(Lane, S, E));
-
-
- if (Parser.getTok().isNot(AsmToken::RBrac)) {
- Error(Parser.getTok().getLoc(), "expected ']' after lane");
- return MatchOperand_ParseFail;
- }
-
- Operands.push_back(AArch64Operand::CreateToken("]", Loc));
- Parser.Lex(); // Eat ']'
-
- return MatchOperand_Success;
+static bool isValidVectorKind(StringRef Name) {
+ return StringSwitch<bool>(Name.lower())
+ .Case(".8b", true)
+ .Case(".16b", true)
+ .Case(".4h", true)
+ .Case(".8h", true)
+ .Case(".2s", true)
+ .Case(".4s", true)
+ .Case(".1d", true)
+ .Case(".2d", true)
+ .Case(".1q", true)
+ // Accept the width neutral ones, too, for verbose syntax. If those
+ // aren't used in the right places, the token operand won't match so
+ // all will work out.
+ .Case(".b", true)
+ .Case(".h", true)
+ .Case(".s", true)
+ .Case(".d", true)
+ .Default(false);
}
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) {
- assert(getLexer().is(AsmToken::Colon) && "expected a ':'");
- Parser.Lex();
+static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
+ char &ElementKind) {
+ assert(isValidVectorKind(Name));
- if (getLexer().isNot(AsmToken::Identifier)) {
- Error(Parser.getTok().getLoc(),
- "expected relocation specifier in operand after ':'");
- return MatchOperand_ParseFail;
+ ElementKind = Name.lower()[Name.size() - 1];
+ NumElements = 0;
+
+ if (Name.size() == 2)
+ return;
+
+ // Parse the lane count
+ Name = Name.drop_front();
+ while (isdigit(Name.front())) {
+ NumElements = 10 * NumElements + (Name.front() - '0');
+ Name = Name.drop_front();
}
-
- std::string LowerCase = Parser.getTok().getIdentifier().lower();
- RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
- .Case("got", AArch64MCExpr::VK_AARCH64_GOT)
- .Case("got_lo12", AArch64MCExpr::VK_AARCH64_GOT_LO12)
- .Case("lo12", AArch64MCExpr::VK_AARCH64_LO12)
- .Case("abs_g0", AArch64MCExpr::VK_AARCH64_ABS_G0)
- .Case("abs_g0_nc", AArch64MCExpr::VK_AARCH64_ABS_G0_NC)
- .Case("abs_g1", AArch64MCExpr::VK_AARCH64_ABS_G1)
- .Case("abs_g1_nc", AArch64MCExpr::VK_AARCH64_ABS_G1_NC)
- .Case("abs_g2", AArch64MCExpr::VK_AARCH64_ABS_G2)
- .Case("abs_g2_nc", AArch64MCExpr::VK_AARCH64_ABS_G2_NC)
- .Case("abs_g3", AArch64MCExpr::VK_AARCH64_ABS_G3)
- .Case("abs_g0_s", AArch64MCExpr::VK_AARCH64_SABS_G0)
- .Case("abs_g1_s", AArch64MCExpr::VK_AARCH64_SABS_G1)
- .Case("abs_g2_s", AArch64MCExpr::VK_AARCH64_SABS_G2)
- .Case("dtprel_g2", AArch64MCExpr::VK_AARCH64_DTPREL_G2)
- .Case("dtprel_g1", AArch64MCExpr::VK_AARCH64_DTPREL_G1)
- .Case("dtprel_g1_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC)
- .Case("dtprel_g0", AArch64MCExpr::VK_AARCH64_DTPREL_G0)
- .Case("dtprel_g0_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC)
- .Case("dtprel_hi12", AArch64MCExpr::VK_AARCH64_DTPREL_HI12)
- .Case("dtprel_lo12", AArch64MCExpr::VK_AARCH64_DTPREL_LO12)
- .Case("dtprel_lo12_nc", AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC)
- .Case("gottprel_g1", AArch64MCExpr::VK_AARCH64_GOTTPREL_G1)
- .Case("gottprel_g0_nc", AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC)
- .Case("gottprel", AArch64MCExpr::VK_AARCH64_GOTTPREL)
- .Case("gottprel_lo12", AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12)
- .Case("tprel_g2", AArch64MCExpr::VK_AARCH64_TPREL_G2)
- .Case("tprel_g1", AArch64MCExpr::VK_AARCH64_TPREL_G1)
- .Case("tprel_g1_nc", AArch64MCExpr::VK_AARCH64_TPREL_G1_NC)
- .Case("tprel_g0", AArch64MCExpr::VK_AARCH64_TPREL_G0)
- .Case("tprel_g0_nc", AArch64MCExpr::VK_AARCH64_TPREL_G0_NC)
- .Case("tprel_hi12", AArch64MCExpr::VK_AARCH64_TPREL_HI12)
- .Case("tprel_lo12", AArch64MCExpr::VK_AARCH64_TPREL_LO12)
- .Case("tprel_lo12_nc", AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC)
- .Case("tlsdesc", AArch64MCExpr::VK_AARCH64_TLSDESC)
- .Case("tlsdesc_lo12", AArch64MCExpr::VK_AARCH64_TLSDESC_LO12)
- .Default(AArch64MCExpr::VK_AARCH64_None);
-
- if (RefKind == AArch64MCExpr::VK_AARCH64_None) {
- Error(Parser.getTok().getLoc(),
- "expected relocation specifier in operand after ':'");
- return MatchOperand_ParseFail;
- }
- Parser.Lex(); // Eat identifier
-
- if (getLexer().isNot(AsmToken::Colon)) {
- Error(Parser.getTok().getLoc(),
- "expected ':' after relocation specifier");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- return MatchOperand_Success;
}
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseImmWithLSLOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ StartLoc = getLoc();
+ RegNo = tryParseRegister();
+ EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ return (RegNo == (unsigned)-1);
+}
- SMLoc S = Parser.getTok().getLoc();
+/// tryParseRegister - Try to parse a register name. The token must be an
+/// Identifier when called, and if it is a register name the token is eaten and
+/// the register is added to the operand list.
+int AArch64AsmParser::tryParseRegister() {
+ const AsmToken &Tok = Parser.getTok();
+ assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
- if (Parser.getTok().is(AsmToken::Hash))
- Parser.Lex(); // Eat '#'
- else if (Parser.getTok().isNot(AsmToken::Integer))
- // Operand should start from # or should be integer, emit error otherwise.
- return MatchOperand_NoMatch;
+ std::string lowerCase = Tok.getString().lower();
+ unsigned RegNum = MatchRegisterName(lowerCase);
+ // Also handle a few aliases of registers.
+ if (RegNum == 0)
+ RegNum = StringSwitch<unsigned>(lowerCase)
+ .Case("fp", AArch64::FP)
+ .Case("lr", AArch64::LR)
+ .Case("x31", AArch64::XZR)
+ .Case("w31", AArch64::WZR)
+ .Default(0);
- const MCExpr *Imm;
- if (ParseImmediate(Imm) != MatchOperand_Success)
- return MatchOperand_ParseFail;
- else if (Parser.getTok().isNot(AsmToken::Comma)) {
- SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, 0, true, S, E));
- return MatchOperand_Success;
+ if (RegNum == 0)
+ return -1;
+
+ Parser.Lex(); // Eat identifier token.
+ return RegNum;
+}
+
+/// tryMatchVectorRegister - Try to parse a vector register name with optional
+/// kind specifier. If it is a register specifier, eat the token and return it.
+int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ TokError("vector register expected");
+ return -1;
}
- // Eat ','
- Parser.Lex();
-
- // The optional operand must be "lsl #N" where N is non-negative.
- if (Parser.getTok().is(AsmToken::Identifier)
- && Parser.getTok().getIdentifier().equals_lower("lsl")) {
- Parser.Lex();
-
- if (Parser.getTok().is(AsmToken::Hash)) {
- Parser.Lex();
-
- if (Parser.getTok().isNot(AsmToken::Integer)) {
- Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
- return MatchOperand_ParseFail;
+ StringRef Name = Parser.getTok().getString();
+ // If there is a kind specifier, it's separated from the register name by
+ // a '.'.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Head = Name.slice(Start, Next);
+ unsigned RegNum = matchVectorRegName(Head);
+ if (RegNum) {
+ if (Next != StringRef::npos) {
+ Kind = Name.slice(Next, StringRef::npos);
+ if (!isValidVectorKind(Kind)) {
+ TokError("invalid vector kind qualifier");
+ return -1;
}
}
+ Parser.Lex(); // Eat the register token.
+ return RegNum;
}
- int64_t ShiftAmount = Parser.getTok().getIntVal();
-
- if (ShiftAmount < 0) {
- Error(Parser.getTok().getLoc(), "positive shift amount required");
- return MatchOperand_ParseFail;
- }
- Parser.Lex(); // Eat the number
-
- SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, ShiftAmount,
- false, S, E));
- return MatchOperand_Success;
+ if (expected)
+ TokError("vector register expected");
+ return -1;
}
-
+/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseCondCodeOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- if (Parser.getTok().isNot(AsmToken::Identifier))
- return MatchOperand_NoMatch;
+AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+ SMLoc S = getLoc();
- StringRef Tok = Parser.getTok().getIdentifier();
- A64CC::CondCodes CondCode = A64StringToCondCode(Tok);
-
- if (CondCode == A64CC::Invalid)
- return MatchOperand_NoMatch;
-
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat condition code
- SMLoc E = Parser.getTok().getLoc();
-
- Operands.push_back(AArch64Operand::CreateCondCode(CondCode, S, E));
- return MatchOperand_Success;
-}
-
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseCRxOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- SMLoc S = Parser.getTok().getLoc();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
Error(S, "Expected cN operand where 0 <= N <= 15");
return MatchOperand_ParseFail;
@@ -1574,426 +1895,350 @@
return MatchOperand_ParseFail;
}
- const MCExpr *CRImm = MCConstantExpr::Create(CRNum, getContext());
-
- Parser.Lex();
- SMLoc E = Parser.getTok().getLoc();
-
- Operands.push_back(AArch64Operand::CreateImm(CRImm, S, E));
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
return MatchOperand_Success;
}
+/// tryParsePrefetch - Try to parse a prefetch operand.
AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseFPImmOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-
- SMLoc S = Parser.getTok().getLoc();
-
- bool Hash = false;
- if (Parser.getTok().is(AsmToken::Hash)) {
- Parser.Lex(); // Eat '#'
- Hash = true;
- }
-
- bool Negative = false;
- if (Parser.getTok().is(AsmToken::Minus)) {
- Negative = true;
- Parser.Lex(); // Eat '-'
- } else if (Parser.getTok().is(AsmToken::Plus)) {
- Parser.Lex(); // Eat '+'
- }
-
- if (Parser.getTok().isNot(AsmToken::Real)) {
- if (!Hash)
- return MatchOperand_NoMatch;
- Error(S, "Expected floating-point immediate");
- return MatchOperand_ParseFail;
- }
-
- APFloat RealVal(APFloat::IEEEdouble, Parser.getTok().getString());
- if (Negative) RealVal.changeSign();
- double DblVal = RealVal.convertToDouble();
-
- Parser.Lex(); // Eat real number
- SMLoc E = Parser.getTok().getLoc();
-
- Operands.push_back(AArch64Operand::CreateFPImm(DblVal, S, E));
- return MatchOperand_Success;
-}
-
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseFPImm0AndImm0Operand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-
- SMLoc S = Parser.getTok().getLoc();
-
- bool Hash = false;
- if (Parser.getTok().is(AsmToken::Hash)) {
- Parser.Lex(); // Eat '#'
- Hash = true;
- }
-
- APFloat RealVal(0.0);
- if (Parser.getTok().is(AsmToken::Real)) {
- if(Parser.getTok().getString() != "0.0") {
- Error(S, "only #0.0 is acceptable as immediate");
- return MatchOperand_ParseFail;
- }
- }
- else if (Parser.getTok().is(AsmToken::Integer)) {
- if(Parser.getTok().getIntVal() != 0) {
- Error(S, "only #0.0 is acceptable as immediate");
- return MatchOperand_ParseFail;
- }
- }
- else {
- if (!Hash)
- return MatchOperand_NoMatch;
- Error(S, "only #0.0 is acceptable as immediate");
- return MatchOperand_ParseFail;
- }
-
- Parser.Lex(); // Eat real number
- SMLoc E = Parser.getTok().getLoc();
-
- Operands.push_back(AArch64Operand::CreateFPImm(0.0, S, E));
- return MatchOperand_Success;
-}
-
-// Automatically generated
-static unsigned MatchRegisterName(StringRef Name);
-
-bool
-AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
- StringRef &Layout,
- SMLoc &LayoutLoc) const {
+AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+ SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
+ // Either an identifier for named values or a 5-bit immediate.
+ bool Hash = Tok.is(AsmToken::Hash);
+ if (Hash || Tok.is(AsmToken::Integer)) {
+ if (Hash)
+ Parser.Lex(); // Eat hash token.
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
- if (Tok.isNot(AsmToken::Identifier))
- return false;
-
- std::string LowerReg = Tok.getString().lower();
- size_t DotPos = LowerReg.find('.');
-
- bool IsVec128 = false;
- SMLoc S = Tok.getLoc();
- RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
-
- if (DotPos == std::string::npos) {
- Layout = StringRef();
- } else {
- // Everything afterwards needs to be a literal token, expected to be
- // '.2d','.b' etc for vector registers.
-
- // This StringSwitch validates the input and (perhaps more importantly)
- // gives us a permanent string to use in the token (a pointer into LowerReg
- // would go out of scope when we return).
- LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
- StringRef LayoutText = StringRef(LowerReg).substr(DotPos);
-
- // See if it's a 128-bit layout first.
- Layout = StringSwitch<const char *>(LayoutText)
- .Case(".q", ".q").Case(".1q", ".1q")
- .Case(".d", ".d").Case(".2d", ".2d")
- .Case(".s", ".s").Case(".4s", ".4s")
- .Case(".h", ".h").Case(".8h", ".8h")
- .Case(".b", ".b").Case(".16b", ".16b")
- .Default("");
-
- if (Layout.size() != 0)
- IsVec128 = true;
- else {
- Layout = StringSwitch<const char *>(LayoutText)
- .Case(".1d", ".1d")
- .Case(".2s", ".2s")
- .Case(".4h", ".4h")
- .Case(".8b", ".8b")
- .Default("");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for prefetch operand");
+ return MatchOperand_ParseFail;
+ }
+ unsigned prfop = MCE->getValue();
+ if (prfop > 31) {
+ TokError("prefetch operand out of range, [0,31] expected");
+ return MatchOperand_ParseFail;
}
- if (Layout.size() == 0) {
- // If we've still not pinned it down the register is malformed.
- return false;
- }
+ Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+ return MatchOperand_Success;
}
- RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
- if (RegNum == AArch64::NoRegister) {
- RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
- .Case("ip0", AArch64::X16)
- .Case("ip1", AArch64::X17)
- .Case("fp", AArch64::X29)
- .Case("lr", AArch64::X30)
- .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0)
- .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1)
- .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2)
- .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3)
- .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4)
- .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5)
- .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6)
- .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7)
- .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8)
- .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9)
- .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10)
- .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11)
- .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12)
- .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13)
- .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14)
- .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15)
- .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16)
- .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17)
- .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18)
- .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19)
- .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20)
- .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21)
- .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22)
- .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23)
- .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24)
- .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25)
- .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26)
- .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27)
- .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28)
- .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29)
- .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30)
- .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31)
- .Default(AArch64::NoRegister);
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("pre-fetch hint expected");
+ return MatchOperand_ParseFail;
}
- if (RegNum == AArch64::NoRegister)
- return false;
- return true;
+ bool Valid;
+ unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid);
+ if (!Valid) {
+ TokError("pre-fetch hint expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+ return MatchOperand_Success;
}
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction.
AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- uint32_t &NumLanes) {
- unsigned RegNum;
- StringRef Layout;
- SMLoc RegEndLoc, LayoutLoc;
- SMLoc S = Parser.getTok().getLoc();
+AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
- if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat hash token.
+ }
+
+ if (parseSymbolicImmVal(Expr))
+ return MatchOperand_ParseFail;
+
+ AArch64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ int64_t Addend;
+ if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+ if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+ ELFRefKind == AArch64MCExpr::VK_INVALID) {
+ // No modifier was specified at all; this is the syntax for an ELF basic
+ // ADRP relocation (unfortunately).
+ Expr =
+ AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+ } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+ Addend != 0) {
+ Error(S, "gotpage label reference not allowed an addend");
+ return MatchOperand_ParseFail;
+ } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+ DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+ DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+ ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+ ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
+ ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
+ // The operand must be an @page or @gotpage qualified symbolref.
+ Error(S, "page or gotpage label reference expected");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ // We have either a label reference possibly with addend or an immediate. The
+ // addend is a raw value here. The linker will adjust it to only reference the
+ // page.
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+ return MatchOperand_Success;
+}
+
+/// tryParseAdrLabel - Parse and validate a source label for the ADR
+/// instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
+
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat hash token.
+ }
+
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+ return MatchOperand_Success;
+}
+
+/// tryParseFPImm - A floating point immediate expression operand.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+ SMLoc S = getLoc();
+
+ bool Hash = false;
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat '#'
+ Hash = true;
+ }
+
+ // Handle negation, as that still comes through as a separate token.
+ bool isNegative = false;
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ isNegative = true;
+ Parser.Lex();
+ }
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Real)) {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ // If we had a '-' in front, toggle the sign bit.
+ IntVal ^= (uint64_t)isNegative << 63;
+ int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+ Parser.Lex(); // Eat the token.
+ // Check for out of range values. As an exception, we let Zero through,
+ // as we handle that special case in post-processing before matching in
+ // order to use the zero register for it.
+ if (Val == -1 && !RealVal.isZero()) {
+ TokError("expected compatible register or floating-point constant");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+ if (Tok.is(AsmToken::Integer)) {
+ int64_t Val;
+ if (!isNegative && Tok.getString().startswith("0x")) {
+ Val = Tok.getIntVal();
+ if (Val > 255 || Val < 0) {
+ TokError("encoded floating point value out of range");
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ // If we had a '-' in front, toggle the sign bit.
+ IntVal ^= (uint64_t)isNegative << 63;
+ Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+ }
+ Parser.Lex(); // Eat the token.
+ Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+
+ if (!Hash)
return MatchOperand_NoMatch;
- Operands.push_back(AArch64Operand::CreateReg(RegNum, S, RegEndLoc));
-
- if (Layout.size() != 0) {
- unsigned long long TmpLanes = 0;
- llvm::getAsUnsignedInteger(Layout.substr(1), 10, TmpLanes);
- if (TmpLanes != 0) {
- NumLanes = TmpLanes;
- } else {
- // If the number of lanes isn't specified explicitly, a valid instruction
- // will have an element specifier and be capable of acting on the entire
- // vector register.
- switch (Layout.back()) {
- default: llvm_unreachable("Invalid layout specifier");
- case 'b': NumLanes = 16; break;
- case 'h': NumLanes = 8; break;
- case 's': NumLanes = 4; break;
- case 'd': NumLanes = 2; break;
- case 'q': NumLanes = 1; break;
- }
- }
-
- Operands.push_back(AArch64Operand::CreateToken(Layout, LayoutLoc));
- }
-
- Parser.Lex();
- return MatchOperand_Success;
-}
-
-bool
-AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) {
- // This callback is used for things like DWARF frame directives in
- // assembly. They don't care about things like NEON layouts or lanes, they
- // just want to be able to produce the DWARF register number.
- StringRef LayoutSpec;
- SMLoc RegEndLoc, LayoutLoc;
- StartLoc = Parser.getTok().getLoc();
-
- if (!IdentifyRegister(RegNo, RegEndLoc, LayoutSpec, LayoutLoc))
- return true;
-
- Parser.Lex();
- EndLoc = Parser.getTok().getLoc();
-
- return false;
-}
-
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseNamedImmOperand(const NamedImmMapper &Mapper,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // Since these operands occur in very limited circumstances, without
- // alternatives, we actually signal an error if there is no match. If relaxing
- // this, beware of unintended consequences: an immediate will be accepted
- // during matching, no matter how it gets into the AArch64Operand.
- const AsmToken &Tok = Parser.getTok();
- SMLoc S = Tok.getLoc();
-
- if (Tok.is(AsmToken::Identifier)) {
- bool ValidName;
- uint32_t Code = Mapper.fromString(Tok.getString().lower(), ValidName);
-
- if (!ValidName) {
- Error(S, "operand specifier not recognised");
- return MatchOperand_ParseFail;
- }
-
- Parser.Lex(); // We're done with the identifier. Eat it
-
- SMLoc E = Parser.getTok().getLoc();
- const MCExpr *Imm = MCConstantExpr::Create(Code, getContext());
- Operands.push_back(AArch64Operand::CreateImm(Imm, S, E));
- return MatchOperand_Success;
- } else if (Tok.is(AsmToken::Hash)) {
- Parser.Lex();
-
- const MCExpr *ImmVal;
- if (ParseImmediate(ImmVal) != MatchOperand_Success)
- return MatchOperand_ParseFail;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
- if (!CE || CE->getValue() < 0 || !Mapper.validImm(CE->getValue())) {
- Error(S, "Invalid immediate for instruction");
- return MatchOperand_ParseFail;
- }
-
- SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E));
- return MatchOperand_Success;
- }
-
- Error(S, "unexpected operand for instruction");
+ TokError("invalid floating point immediate");
return MatchOperand_ParseFail;
}
+/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseSysRegOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- const AsmToken &Tok = Parser.getTok();
+AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+ SMLoc S = getLoc();
- // Any MSR/MRS operand will be an identifier, and we want to store it as some
- // kind of string: SPSel is valid for two different forms of MSR with two
- // different encodings. There's no collision at the moment, but the potential
- // is there.
- if (!Tok.is(AsmToken::Identifier)) {
+ if (Parser.getTok().is(AsmToken::Hash))
+ Parser.Lex(); // Eat '#'
+ else if (Parser.getTok().isNot(AsmToken::Integer))
+ // Operand should start from # or should be integer, emit error otherwise.
return MatchOperand_NoMatch;
- }
- SMLoc S = Tok.getLoc();
- Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), S));
- Parser.Lex(); // Eat identifier
-
- return MatchOperand_Success;
-}
-
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseLSXAddressOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- SMLoc S = Parser.getTok().getLoc();
-
- unsigned RegNum;
- SMLoc RegEndLoc, LayoutLoc;
- StringRef Layout;
- if(!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)
- || !AArch64MCRegisterClasses[AArch64::GPR64xspRegClassID].contains(RegNum)
- || Layout.size() != 0) {
- // Check Layout.size because we don't want to let "x3.4s" or similar
- // through.
- return MatchOperand_NoMatch;
- }
- Parser.Lex(); // Eat register
-
- if (Parser.getTok().is(AsmToken::RBrac)) {
- // We're done
+ const MCExpr *Imm;
+ if (parseSymbolicImmVal(Imm))
+ return MatchOperand_ParseFail;
+ else if (Parser.getTok().isNot(AsmToken::Comma)) {
+ uint64_t ShiftAmount = 0;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
+ if (MCE) {
+ int64_t Val = MCE->getValue();
+ if (Val > 0xfff && (Val & 0xfff) == 0) {
+ Imm = MCConstantExpr::Create(Val >> 12, getContext());
+ ShiftAmount = 12;
+ }
+ }
SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
+ Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
+ getContext()));
return MatchOperand_Success;
}
- // Otherwise, only ", #0" is valid
+ // Eat ','
+ Parser.Lex();
- if (Parser.getTok().isNot(AsmToken::Comma)) {
- Error(Parser.getTok().getLoc(), "expected ',' or ']' after register");
+ // The optional operand must be "lsl #N" where N is non-negative.
+ if (!Parser.getTok().is(AsmToken::Identifier) ||
+ !Parser.getTok().getIdentifier().equals_lower("lsl")) {
+ Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat ','
- if (Parser.getTok().isNot(AsmToken::Hash)) {
- Error(Parser.getTok().getLoc(), "expected '#0'");
+ // Eat 'lsl'
+ Parser.Lex();
+
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex();
+ }
+
+ if (Parser.getTok().isNot(AsmToken::Integer)) {
+ Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat '#'
- if (Parser.getTok().isNot(AsmToken::Integer)
- || Parser.getTok().getIntVal() != 0 ) {
- Error(Parser.getTok().getLoc(), "expected '#0'");
+ int64_t ShiftAmount = Parser.getTok().getIntVal();
+
+ if (ShiftAmount < 0) {
+ Error(Parser.getTok().getLoc(), "positive shift amount required");
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat '0'
+ Parser.Lex(); // Eat the number
SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
+ Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
+ S, E, getContext()));
return MatchOperand_Success;
}
+/// parseCondCodeString - Parse a Condition Code string.
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+ AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+ .Case("eq", AArch64CC::EQ)
+ .Case("ne", AArch64CC::NE)
+ .Case("cs", AArch64CC::HS)
+ .Case("hs", AArch64CC::HS)
+ .Case("cc", AArch64CC::LO)
+ .Case("lo", AArch64CC::LO)
+ .Case("mi", AArch64CC::MI)
+ .Case("pl", AArch64CC::PL)
+ .Case("vs", AArch64CC::VS)
+ .Case("vc", AArch64CC::VC)
+ .Case("hi", AArch64CC::HI)
+ .Case("ls", AArch64CC::LS)
+ .Case("ge", AArch64CC::GE)
+ .Case("lt", AArch64CC::LT)
+ .Case("gt", AArch64CC::GT)
+ .Case("le", AArch64CC::LE)
+ .Case("al", AArch64CC::AL)
+ .Case("nv", AArch64CC::NV)
+ .Default(AArch64CC::Invalid);
+ return CC;
+}
+
+/// parseCondCode - Parse a Condition Code operand.
+bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
+ bool invertCondCode) {
+ SMLoc S = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+ StringRef Cond = Tok.getString();
+ AArch64CC::CondCode CC = parseCondCodeString(Cond);
+ if (CC == AArch64CC::Invalid)
+ return TokError("invalid condition code");
+ Parser.Lex(); // Eat identifier token.
+
+ if (invertCondCode)
+ CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC));
+
+ Operands.push_back(
+ AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext()));
+ return false;
+}
+
+/// tryParseOptionalShift - Some operands take an optional shift argument. Parse
+/// them if present.
AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseShiftExtend(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- StringRef IDVal = Parser.getTok().getIdentifier();
- std::string LowerID = IDVal.lower();
+AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ std::string LowerID = Tok.getString().lower();
+ AArch64_AM::ShiftExtendType ShOp =
+ StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
+ .Case("lsl", AArch64_AM::LSL)
+ .Case("lsr", AArch64_AM::LSR)
+ .Case("asr", AArch64_AM::ASR)
+ .Case("ror", AArch64_AM::ROR)
+ .Case("msl", AArch64_AM::MSL)
+ .Case("uxtb", AArch64_AM::UXTB)
+ .Case("uxth", AArch64_AM::UXTH)
+ .Case("uxtw", AArch64_AM::UXTW)
+ .Case("uxtx", AArch64_AM::UXTX)
+ .Case("sxtb", AArch64_AM::SXTB)
+ .Case("sxth", AArch64_AM::SXTH)
+ .Case("sxtw", AArch64_AM::SXTW)
+ .Case("sxtx", AArch64_AM::SXTX)
+ .Default(AArch64_AM::InvalidShiftExtend);
- A64SE::ShiftExtSpecifiers Spec =
- StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
- .Case("lsl", A64SE::LSL)
- .Case("msl", A64SE::MSL)
- .Case("lsr", A64SE::LSR)
- .Case("asr", A64SE::ASR)
- .Case("ror", A64SE::ROR)
- .Case("uxtb", A64SE::UXTB)
- .Case("uxth", A64SE::UXTH)
- .Case("uxtw", A64SE::UXTW)
- .Case("uxtx", A64SE::UXTX)
- .Case("sxtb", A64SE::SXTB)
- .Case("sxth", A64SE::SXTH)
- .Case("sxtw", A64SE::SXTW)
- .Case("sxtx", A64SE::SXTX)
- .Default(A64SE::Invalid);
-
- if (Spec == A64SE::Invalid)
+ if (ShOp == AArch64_AM::InvalidShiftExtend)
return MatchOperand_NoMatch;
- // Eat the shift
- SMLoc S, E;
- S = Parser.getTok().getLoc();
+ SMLoc S = Tok.getLoc();
Parser.Lex();
- if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR &&
- Spec != A64SE::ROR && Spec != A64SE::MSL) {
- // The shift amount can be omitted for the extending versions, but not real
- // shifts:
- // add x0, x0, x0, uxtb
- // is valid, and equivalent to
- // add x0, x0, x0, uxtb #0
-
- if (Parser.getTok().is(AsmToken::Comma) ||
- Parser.getTok().is(AsmToken::EndOfStatement) ||
- Parser.getTok().is(AsmToken::RBrac)) {
- Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, 0, true,
- S, E));
- return MatchOperand_Success;
+ bool Hash = getLexer().is(AsmToken::Hash);
+ if (!Hash && getLexer().isNot(AsmToken::Integer)) {
+ if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR ||
+ ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR ||
+ ShOp == AArch64_AM::MSL) {
+ // We expect a number here.
+ TokError("expected #imm after shift specifier");
+ return MatchOperand_ParseFail;
}
+
+ // "extend" type operatoins don't need an immediate, #0 is implicit.
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(
+ AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext()));
+ return MatchOperand_Success;
}
- // Eat # at beginning of immediate
- if (!Parser.getTok().is(AsmToken::Hash)) {
- Error(Parser.getTok().getLoc(),
- "expected #imm after shift specifier");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
+ if (Hash)
+ Parser.Lex(); // Eat the '#'.
// Make sure we do actually have a number
if (!Parser.getTok().is(AsmToken::Integer)) {
@@ -2001,316 +2246,871 @@
"expected integer shift amount");
return MatchOperand_ParseFail;
}
- unsigned Amount = Parser.getTok().getIntVal();
- Parser.Lex();
- E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, Amount, false,
- S, E));
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("expected #imm after shift specifier");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateShiftExtend(
+ ShOp, MCE->getValue(), true, S, E, getContext()));
+ return MatchOperand_Success;
+}
+
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ if (Name.find('.') != StringRef::npos)
+ return TokError("invalid operand");
+
+ Mnemonic = Name;
+ Operands.push_back(
+ AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+
+ const AsmToken &Tok = Parser.getTok();
+ StringRef Op = Tok.getString();
+ SMLoc S = Tok.getLoc();
+
+ const MCExpr *Expr = nullptr;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2) \
+ do { \
+ Expr = MCConstantExpr::Create(op1, getContext()); \
+ Operands.push_back( \
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
+ Operands.push_back( \
+ AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \
+ Operands.push_back( \
+ AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \
+ Expr = MCConstantExpr::Create(op2, getContext()); \
+ Operands.push_back( \
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
+ } while (0)
+
+ if (Mnemonic == "ic") {
+ if (!Op.compare_lower("ialluis")) {
+ // SYS #0, C7, C1, #0
+ SYS_ALIAS(0, 7, 1, 0);
+ } else if (!Op.compare_lower("iallu")) {
+ // SYS #0, C7, C5, #0
+ SYS_ALIAS(0, 7, 5, 0);
+ } else if (!Op.compare_lower("ivau")) {
+ // SYS #3, C7, C5, #1
+ SYS_ALIAS(3, 7, 5, 1);
+ } else {
+ return TokError("invalid operand for IC instruction");
+ }
+ } else if (Mnemonic == "dc") {
+ if (!Op.compare_lower("zva")) {
+ // SYS #3, C7, C4, #1
+ SYS_ALIAS(3, 7, 4, 1);
+ } else if (!Op.compare_lower("ivac")) {
+ // SYS #3, C7, C6, #1
+ SYS_ALIAS(0, 7, 6, 1);
+ } else if (!Op.compare_lower("isw")) {
+ // SYS #0, C7, C6, #2
+ SYS_ALIAS(0, 7, 6, 2);
+ } else if (!Op.compare_lower("cvac")) {
+ // SYS #3, C7, C10, #1
+ SYS_ALIAS(3, 7, 10, 1);
+ } else if (!Op.compare_lower("csw")) {
+ // SYS #0, C7, C10, #2
+ SYS_ALIAS(0, 7, 10, 2);
+ } else if (!Op.compare_lower("cvau")) {
+ // SYS #3, C7, C11, #1
+ SYS_ALIAS(3, 7, 11, 1);
+ } else if (!Op.compare_lower("civac")) {
+ // SYS #3, C7, C14, #1
+ SYS_ALIAS(3, 7, 14, 1);
+ } else if (!Op.compare_lower("cisw")) {
+ // SYS #0, C7, C14, #2
+ SYS_ALIAS(0, 7, 14, 2);
+ } else {
+ return TokError("invalid operand for DC instruction");
+ }
+ } else if (Mnemonic == "at") {
+ if (!Op.compare_lower("s1e1r")) {
+ // SYS #0, C7, C8, #0
+ SYS_ALIAS(0, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e2r")) {
+ // SYS #4, C7, C8, #0
+ SYS_ALIAS(4, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e3r")) {
+ // SYS #6, C7, C8, #0
+ SYS_ALIAS(6, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e1w")) {
+ // SYS #0, C7, C8, #1
+ SYS_ALIAS(0, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e2w")) {
+ // SYS #4, C7, C8, #1
+ SYS_ALIAS(4, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e3w")) {
+ // SYS #6, C7, C8, #1
+ SYS_ALIAS(6, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e0r")) {
+ // SYS #0, C7, C8, #3
+ SYS_ALIAS(0, 7, 8, 2);
+ } else if (!Op.compare_lower("s1e0w")) {
+ // SYS #0, C7, C8, #3
+ SYS_ALIAS(0, 7, 8, 3);
+ } else if (!Op.compare_lower("s12e1r")) {
+ // SYS #4, C7, C8, #4
+ SYS_ALIAS(4, 7, 8, 4);
+ } else if (!Op.compare_lower("s12e1w")) {
+ // SYS #4, C7, C8, #5
+ SYS_ALIAS(4, 7, 8, 5);
+ } else if (!Op.compare_lower("s12e0r")) {
+ // SYS #4, C7, C8, #6
+ SYS_ALIAS(4, 7, 8, 6);
+ } else if (!Op.compare_lower("s12e0w")) {
+ // SYS #4, C7, C8, #7
+ SYS_ALIAS(4, 7, 8, 7);
+ } else {
+ return TokError("invalid operand for AT instruction");
+ }
+ } else if (Mnemonic == "tlbi") {
+ if (!Op.compare_lower("vmalle1is")) {
+ // SYS #0, C8, C3, #0
+ SYS_ALIAS(0, 8, 3, 0);
+ } else if (!Op.compare_lower("alle2is")) {
+ // SYS #4, C8, C3, #0
+ SYS_ALIAS(4, 8, 3, 0);
+ } else if (!Op.compare_lower("alle3is")) {
+ // SYS #6, C8, C3, #0
+ SYS_ALIAS(6, 8, 3, 0);
+ } else if (!Op.compare_lower("vae1is")) {
+ // SYS #0, C8, C3, #1
+ SYS_ALIAS(0, 8, 3, 1);
+ } else if (!Op.compare_lower("vae2is")) {
+ // SYS #4, C8, C3, #1
+ SYS_ALIAS(4, 8, 3, 1);
+ } else if (!Op.compare_lower("vae3is")) {
+ // SYS #6, C8, C3, #1
+ SYS_ALIAS(6, 8, 3, 1);
+ } else if (!Op.compare_lower("aside1is")) {
+ // SYS #0, C8, C3, #2
+ SYS_ALIAS(0, 8, 3, 2);
+ } else if (!Op.compare_lower("vaae1is")) {
+ // SYS #0, C8, C3, #3
+ SYS_ALIAS(0, 8, 3, 3);
+ } else if (!Op.compare_lower("alle1is")) {
+ // SYS #4, C8, C3, #4
+ SYS_ALIAS(4, 8, 3, 4);
+ } else if (!Op.compare_lower("vale1is")) {
+ // SYS #0, C8, C3, #5
+ SYS_ALIAS(0, 8, 3, 5);
+ } else if (!Op.compare_lower("vaale1is")) {
+ // SYS #0, C8, C3, #7
+ SYS_ALIAS(0, 8, 3, 7);
+ } else if (!Op.compare_lower("vmalle1")) {
+ // SYS #0, C8, C7, #0
+ SYS_ALIAS(0, 8, 7, 0);
+ } else if (!Op.compare_lower("alle2")) {
+ // SYS #4, C8, C7, #0
+ SYS_ALIAS(4, 8, 7, 0);
+ } else if (!Op.compare_lower("vale2is")) {
+ // SYS #4, C8, C3, #5
+ SYS_ALIAS(4, 8, 3, 5);
+ } else if (!Op.compare_lower("vale3is")) {
+ // SYS #6, C8, C3, #5
+ SYS_ALIAS(6, 8, 3, 5);
+ } else if (!Op.compare_lower("alle3")) {
+ // SYS #6, C8, C7, #0
+ SYS_ALIAS(6, 8, 7, 0);
+ } else if (!Op.compare_lower("vae1")) {
+ // SYS #0, C8, C7, #1
+ SYS_ALIAS(0, 8, 7, 1);
+ } else if (!Op.compare_lower("vae2")) {
+ // SYS #4, C8, C7, #1
+ SYS_ALIAS(4, 8, 7, 1);
+ } else if (!Op.compare_lower("vae3")) {
+ // SYS #6, C8, C7, #1
+ SYS_ALIAS(6, 8, 7, 1);
+ } else if (!Op.compare_lower("aside1")) {
+ // SYS #0, C8, C7, #2
+ SYS_ALIAS(0, 8, 7, 2);
+ } else if (!Op.compare_lower("vaae1")) {
+ // SYS #0, C8, C7, #3
+ SYS_ALIAS(0, 8, 7, 3);
+ } else if (!Op.compare_lower("alle1")) {
+ // SYS #4, C8, C7, #4
+ SYS_ALIAS(4, 8, 7, 4);
+ } else if (!Op.compare_lower("vale1")) {
+ // SYS #0, C8, C7, #5
+ SYS_ALIAS(0, 8, 7, 5);
+ } else if (!Op.compare_lower("vale2")) {
+ // SYS #4, C8, C7, #5
+ SYS_ALIAS(4, 8, 7, 5);
+ } else if (!Op.compare_lower("vale3")) {
+ // SYS #6, C8, C7, #5
+ SYS_ALIAS(6, 8, 7, 5);
+ } else if (!Op.compare_lower("vaale1")) {
+ // SYS #0, C8, C7, #7
+ SYS_ALIAS(0, 8, 7, 7);
+ } else if (!Op.compare_lower("ipas2e1")) {
+ // SYS #4, C8, C4, #1
+ SYS_ALIAS(4, 8, 4, 1);
+ } else if (!Op.compare_lower("ipas2le1")) {
+ // SYS #4, C8, C4, #5
+ SYS_ALIAS(4, 8, 4, 5);
+ } else if (!Op.compare_lower("ipas2e1is")) {
+ // SYS #4, C8, C4, #1
+ SYS_ALIAS(4, 8, 0, 1);
+ } else if (!Op.compare_lower("ipas2le1is")) {
+ // SYS #4, C8, C4, #5
+ SYS_ALIAS(4, 8, 0, 5);
+ } else if (!Op.compare_lower("vmalls12e1")) {
+ // SYS #4, C8, C7, #6
+ SYS_ALIAS(4, 8, 7, 6);
+ } else if (!Op.compare_lower("vmalls12e1is")) {
+ // SYS #4, C8, C3, #6
+ SYS_ALIAS(4, 8, 3, 6);
+ } else {
+ return TokError("invalid operand for TLBI instruction");
+ }
+ }
+
+#undef SYS_ALIAS
+
+ Parser.Lex(); // Eat operand.
+
+ bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
+ bool HasRegister = false;
+
+ // Check for the optional register operand.
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat comma.
+
+ if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+ return TokError("expected register operand");
+
+ HasRegister = true;
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Parser.eatToEndOfStatement();
+ return TokError("unexpected token in argument list");
+ }
+
+ if (ExpectRegister && !HasRegister) {
+ return TokError("specified " + Mnemonic + " op requires a register");
+ }
+ else if (!ExpectRegister && HasRegister) {
+ return TokError("specified " + Mnemonic + " op does not use a register");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+
+ // Can be either a #imm style literal or an option name
+ bool Hash = Tok.is(AsmToken::Hash);
+ if (Hash || Tok.is(AsmToken::Integer)) {
+ // Immediate operand.
+ if (Hash)
+ Parser.Lex(); // Eat the '#'
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = getLoc();
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ Error(ExprLoc, "immediate value expected for barrier operand");
+ return MatchOperand_ParseFail;
+ }
+ if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+ Error(ExprLoc, "barrier operand out of range");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(
+ AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext()));
+ return MatchOperand_Success;
+ }
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ bool Valid;
+ unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid);
+ if (!Valid) {
+ TokError("invalid barrier option name");
+ return MatchOperand_ParseFail;
+ }
+
+ // The only valid named option for ISB is 'sy'
+ if (Mnemonic == "isb" && Opt != AArch64DB::SY) {
+ TokError("'sy' or #imm operand expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AArch64Operand::CreateBarrier(Opt, getLoc(), getContext()));
+ Parser.Lex(); // Consume the option
return MatchOperand_Success;
}
-/// Try to parse a vector register token, If it is a vector register,
-/// the token is eaten and return true. Otherwise return false.
-bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
- StringRef &Layout, SMLoc &LayoutLoc) {
- bool IsVector = true;
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
- if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
- IsVector = false;
- else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID]
- .contains(RegNum) &&
- !AArch64MCRegisterClasses[AArch64::FPR128RegClassID]
- .contains(RegNum))
- IsVector = false;
- else if (Layout.size() == 0)
- IsVector = false;
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
- if (!IsVector)
- Error(Parser.getTok().getLoc(), "expected vector type register");
+ Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(),
+ STI.getFeatureBits(), getContext()));
+ Parser.Lex(); // Eat identifier
- Parser.Lex(); // Eat this token.
- return IsVector;
+ return MatchOperand_Success;
}
+/// tryParseVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+ if (Parser.getTok().isNot(AsmToken::Identifier))
+ return true;
-// A vector list contains 1-4 consecutive registers.
-// Now there are two kinds of vector list when number of vector > 1:
-// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-// (2) {Vn.layout - Vm.layout}
-// If the layout is like .b/.h/.s/.d, also parse the lane.
-AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- if (Parser.getTok().isNot(AsmToken::LCurly)) {
- Error(Parser.getTok().getLoc(), "'{' expected");
- return MatchOperand_ParseFail;
- }
- SMLoc SLoc = Parser.getTok().getLoc();
- Parser.Lex(); // Eat '{' token.
-
- unsigned Reg, Count = 1;
- StringRef LayoutStr;
- SMLoc RegEndLoc, LayoutLoc;
- if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
- return MatchOperand_ParseFail;
-
- if (Parser.getTok().is(AsmToken::Minus)) {
- Parser.Lex(); // Eat the minus.
-
- unsigned Reg2;
- StringRef LayoutStr2;
- SMLoc RegEndLoc2, LayoutLoc2;
- SMLoc RegLoc2 = Parser.getTok().getLoc();
-
- if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
- return MatchOperand_ParseFail;
- unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
-
- if (LayoutStr != LayoutStr2) {
- Error(LayoutLoc2, "expected the same vector layout");
- return MatchOperand_ParseFail;
- }
- if (Space == 0 || Space > 3) {
- Error(RegLoc2, "invalid number of vectors");
- return MatchOperand_ParseFail;
- }
-
- Count += Space;
- } else {
- unsigned LastReg = Reg;
- while (Parser.getTok().is(AsmToken::Comma)) {
- Parser.Lex(); // Eat the comma.
- unsigned Reg2;
- StringRef LayoutStr2;
- SMLoc RegEndLoc2, LayoutLoc2;
- SMLoc RegLoc2 = Parser.getTok().getLoc();
-
- if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
- return MatchOperand_ParseFail;
- unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
- : (Reg2 + 32 - LastReg);
- Count++;
-
- // The space between two vectors should be 1. And they should have the same layout.
- // Total count shouldn't be great than 4
- if (Space != 1) {
- Error(RegLoc2, "invalid space between two vectors");
- return MatchOperand_ParseFail;
- }
- if (LayoutStr != LayoutStr2) {
- Error(LayoutLoc2, "expected the same vector layout");
- return MatchOperand_ParseFail;
- }
- if (Count > 4) {
- Error(RegLoc2, "invalid number of vectors");
- return MatchOperand_ParseFail;
- }
-
- LastReg = Reg2;
- }
- }
-
- if (Parser.getTok().isNot(AsmToken::RCurly)) {
- Error(Parser.getTok().getLoc(), "'}' expected");
- return MatchOperand_ParseFail;
- }
- SMLoc ELoc = Parser.getTok().getLoc();
- Parser.Lex(); // Eat '}' token.
-
- A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
- if (Count > 1) { // If count > 1, create vector list using super register.
- bool IsVec64 = (Layout < A64Layout::VL_16B);
- static unsigned SupRegIDs[3][2] = {
- { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
- { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
- { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
- };
- unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)];
- unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
- const MCRegisterInfo *MRI = getContext().getRegisterInfo();
- Reg = MRI->getMatchingSuperReg(Reg, Sub0,
- &AArch64MCRegisterClasses[SupRegID]);
- }
+ SMLoc S = getLoc();
+ // Check for a vector register specifier first.
+ StringRef Kind;
+ int64_t Reg = tryMatchVectorRegister(Kind, false);
+ if (Reg == -1)
+ return true;
Operands.push_back(
- AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
+ AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+ // If there was an explicit qualifier, that goes on as a literal text
+ // operand.
+ if (!Kind.empty())
+ Operands.push_back(
+ AArch64Operand::CreateToken(Kind, false, S, getContext()));
+ // If there is an index specifier following the register, parse that too.
if (Parser.getTok().is(AsmToken::LBrac)) {
- uint32_t NumLanes = 0;
- switch(Layout) {
- case A64Layout::VL_B : NumLanes = 16; break;
- case A64Layout::VL_H : NumLanes = 8; break;
- case A64Layout::VL_S : NumLanes = 4; break;
- case A64Layout::VL_D : NumLanes = 2; break;
- default:
- SMLoc Loc = getLexer().getLoc();
- Error(Loc, "expected comma before next operand");
- return MatchOperand_ParseFail;
- }
- return ParseNEONLane(Operands, NumLanes);
- } else {
- return MatchOperand_Success;
- }
-}
+ SMLoc SIdx = getLoc();
+ Parser.Lex(); // Eat left bracket token.
-// FIXME: We would really like to be able to tablegen'erate this.
-bool AArch64AsmParser::
-validateInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- switch (Inst.getOpcode()) {
- case AArch64::BFIwwii:
- case AArch64::BFIxxii:
- case AArch64::SBFIZwwii:
- case AArch64::SBFIZxxii:
- case AArch64::UBFIZwwii:
- case AArch64::UBFIZxxii: {
- unsigned ImmOps = Inst.getNumOperands() - 2;
- int64_t ImmR = Inst.getOperand(ImmOps).getImm();
- int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
-
- if (ImmR != 0 && ImmS >= ImmR) {
- return Error(Operands[4]->getStartLoc(),
- "requested insert overflows register");
- }
- return false;
- }
- case AArch64::BFXILwwii:
- case AArch64::BFXILxxii:
- case AArch64::SBFXwwii:
- case AArch64::SBFXxxii:
- case AArch64::UBFXwwii:
- case AArch64::UBFXxxii: {
- unsigned ImmOps = Inst.getNumOperands() - 2;
- int64_t ImmR = Inst.getOperand(ImmOps).getImm();
- int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
- int64_t RegWidth = 0;
- switch (Inst.getOpcode()) {
- case AArch64::SBFXxxii: case AArch64::UBFXxxii: case AArch64::BFXILxxii:
- RegWidth = 64;
- break;
- case AArch64::SBFXwwii: case AArch64::UBFXwwii: case AArch64::BFXILwwii:
- RegWidth = 32;
- break;
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return false;
}
- if (ImmS >= RegWidth || ImmS < ImmR) {
- return Error(Operands[4]->getStartLoc(),
- "requested extract overflows register");
+ SMLoc E = getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return false;
}
- return false;
- }
- case AArch64::ICix: {
- int64_t ImmVal = Inst.getOperand(0).getImm();
- A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
- if (!A64IC::NeedsRegister(ICOp)) {
- return Error(Operands[1]->getStartLoc(),
- "specified IC op does not use a register");
- }
- return false;
- }
- case AArch64::ICi: {
- int64_t ImmVal = Inst.getOperand(0).getImm();
- A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
- if (A64IC::NeedsRegister(ICOp)) {
- return Error(Operands[1]->getStartLoc(),
- "specified IC op requires a register");
- }
- return false;
- }
- case AArch64::TLBIix: {
- int64_t ImmVal = Inst.getOperand(0).getImm();
- A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
- if (!A64TLBI::NeedsRegister(TLBIOp)) {
- return Error(Operands[1]->getStartLoc(),
- "specified TLBI op does not use a register");
- }
- return false;
- }
- case AArch64::TLBIi: {
- int64_t ImmVal = Inst.getOperand(0).getImm();
- A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
- if (A64TLBI::NeedsRegister(TLBIOp)) {
- return Error(Operands[1]->getStartLoc(),
- "specified TLBI op requires a register");
- }
- return false;
- }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+ E, getContext()));
}
return false;
}
+/// parseRegister - Parse a non-vector register operand.
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ // Try for a vector register.
+ if (!tryParseVectorRegister(Operands))
+ return false;
-// Parses the instruction *together with* all operands, appending each parsed
-// operand to the "Operands" list
-bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
- StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- StringRef PatchedName = StringSwitch<StringRef>(Name.lower())
- .Case("beq", "b.eq")
- .Case("bne", "b.ne")
- .Case("bhs", "b.hs")
- .Case("bcs", "b.cs")
- .Case("blo", "b.lo")
- .Case("bcc", "b.cc")
- .Case("bmi", "b.mi")
- .Case("bpl", "b.pl")
- .Case("bvs", "b.vs")
- .Case("bvc", "b.vc")
- .Case("bhi", "b.hi")
- .Case("bls", "b.ls")
- .Case("bge", "b.ge")
- .Case("blt", "b.lt")
- .Case("bgt", "b.gt")
- .Case("ble", "b.le")
- .Case("bal", "b.al")
- .Case("bnv", "b.nv")
- .Default(Name);
+ // Try for a scalar register.
+ int64_t Reg = tryParseRegister();
+ if (Reg == -1)
+ return true;
+ Operands.push_back(
+ AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
- size_t CondCodePos = PatchedName.find('.');
-
- StringRef Mnemonic = PatchedName.substr(0, CondCodePos);
- Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc));
-
- if (CondCodePos != StringRef::npos) {
- // We have a condition code
- SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1);
- StringRef CondStr = PatchedName.substr(CondCodePos + 1, StringRef::npos);
- A64CC::CondCodes Code;
-
- Code = A64StringToCondCode(CondStr);
-
- if (Code == A64CC::Invalid) {
- Error(S, "invalid condition code");
- Parser.eatToEndOfStatement();
- return true;
+ // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+ // as a string token in the instruction itself.
+ if (getLexer().getKind() == AsmToken::LBrac) {
+ SMLoc LBracS = getLoc();
+ Parser.Lex();
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Integer)) {
+ SMLoc IntS = getLoc();
+ int64_t Val = Tok.getIntVal();
+ if (Val == 1) {
+ Parser.Lex();
+ if (getLexer().getKind() == AsmToken::RBrac) {
+ SMLoc RBracS = getLoc();
+ Parser.Lex();
+ Operands.push_back(
+ AArch64Operand::CreateToken("[", false, LBracS, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken("1", false, IntS, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken("]", false, RBracS, getContext()));
+ return false;
+ }
+ }
}
-
- SMLoc DotL = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos);
-
- Operands.push_back(AArch64Operand::CreateToken(".", DotL));
- SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 3);
- Operands.push_back(AArch64Operand::CreateCondCode(Code, S, E));
}
- // Now we parse the operands of this instruction
+ return false;
+}
+
+bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+ bool HasELFModifier = false;
+ AArch64MCExpr::VariantKind RefKind;
+
+ if (Parser.getTok().is(AsmToken::Colon)) {
+ Parser.Lex(); // Eat ':"
+ HasELFModifier = true;
+
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(),
+ "expect relocation specifier in operand after ':'");
+ return true;
+ }
+
+ std::string LowerCase = Parser.getTok().getIdentifier().lower();
+ RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+ .Case("lo12", AArch64MCExpr::VK_LO12)
+ .Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
+ .Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
+ .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
+ .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
+ .Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
+ .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
+ .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
+ .Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
+ .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
+ .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+ .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
+ .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
+ .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
+ .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
+ .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
+ .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
+ .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
+ .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+ .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
+ .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
+ .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
+ .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
+ .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
+ .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
+ .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
+ .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
+ .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
+ .Case("got", AArch64MCExpr::VK_GOT_PAGE)
+ .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+ .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
+ .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
+ .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
+ .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
+ .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+ .Default(AArch64MCExpr::VK_INVALID);
+
+ if (RefKind == AArch64MCExpr::VK_INVALID) {
+ Error(Parser.getTok().getLoc(),
+ "expect relocation specifier in operand after ':'");
+ return true;
+ }
+
+ Parser.Lex(); // Eat identifier
+
+ if (Parser.getTok().isNot(AsmToken::Colon)) {
+ Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier");
+ return true;
+ }
+ Parser.Lex(); // Eat ':'
+ }
+
+ if (getParser().parseExpression(ImmVal))
+ return true;
+
+ if (HasELFModifier)
+ ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext());
+
+ return false;
+}
+
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+ assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+ StringRef Kind;
+ int64_t FirstReg = tryMatchVectorRegister(Kind, true);
+ if (FirstReg == -1)
+ return true;
+ int64_t PrevReg = FirstReg;
+ unsigned Count = 1;
+
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ Parser.Lex(); // Eat the minus.
+
+ SMLoc Loc = getLoc();
+ StringRef NextKind;
+ int64_t Reg = tryMatchVectorRegister(NextKind, true);
+ if (Reg == -1)
+ return true;
+ // Any Kind suffices must match on all regs in the list.
+ if (Kind != NextKind)
+ return Error(Loc, "mismatched register size suffix");
+
+ unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
+
+ if (Space == 0 || Space > 3) {
+ return Error(Loc, "invalid number of vectors");
+ }
+
+ Count += Space;
+ }
+ else {
+ while (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma token.
+
+ SMLoc Loc = getLoc();
+ StringRef NextKind;
+ int64_t Reg = tryMatchVectorRegister(NextKind, true);
+ if (Reg == -1)
+ return true;
+ // Any Kind suffices must match on all regs in the list.
+ if (Kind != NextKind)
+ return Error(Loc, "mismatched register size suffix");
+
+ // Registers must be incremental (with wraparound at 31)
+ if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+ (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+ return Error(Loc, "registers must be sequential");
+
+ PrevReg = Reg;
+ ++Count;
+ }
+ }
+
+ if (Parser.getTok().isNot(AsmToken::RCurly))
+ return Error(getLoc(), "'}' expected");
+ Parser.Lex(); // Eat the '}' token.
+
+ if (Count > 4)
+ return Error(S, "invalid number of vectors");
+
+ unsigned NumElements = 0;
+ char ElementKind = 0;
+ if (!Kind.empty())
+ parseValidVectorKind(Kind, NumElements, ElementKind);
+
+ Operands.push_back(AArch64Operand::CreateVectorList(
+ FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+
+ // If there is an index specifier following the list, parse that too.
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ SMLoc SIdx = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return false;
+ }
+
+ SMLoc E = getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return false;
+ }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+ E, getContext()));
+ }
+ return false;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ if (!Tok.is(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ unsigned RegNum = MatchRegisterName(Tok.getString().lower());
+
+ MCContext &Ctx = getContext();
+ const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+ if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
+ return MatchOperand_NoMatch;
+
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat register
+
+ if (Parser.getTok().isNot(AsmToken::Comma)) {
+ Operands.push_back(
+ AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+ return MatchOperand_Success;
+ }
+ Parser.Lex(); // Eat comma.
+
+ if (Parser.getTok().is(AsmToken::Hash))
+ Parser.Lex(); // Eat hash
+
+ if (Parser.getTok().isNot(AsmToken::Integer)) {
+ Error(getLoc(), "index must be absent or #0");
+ return MatchOperand_ParseFail;
+ }
+
+ const MCExpr *ImmVal;
+ if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+ cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
+ Error(getLoc(), "index must be absent or #0");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+ return MatchOperand_Success;
+}
+
+/// parseOperand - Parse a arm instruction operand. For now this parses the
+/// operand regardless of the mnemonic.
+bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+ bool invertCondCode) {
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach.
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ if (ResTy == MatchOperand_Success)
+ return false;
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ // Nothing custom, so do general case parsing.
+ SMLoc S, E;
+ switch (getLexer().getKind()) {
+ default: {
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
+ if (parseSymbolicImmVal(Expr))
+ return Error(S, "invalid operand");
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+ return false;
+ }
+ case AsmToken::LBrac: {
+ SMLoc Loc = Parser.getTok().getLoc();
+ Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
+ getContext()));
+ Parser.Lex(); // Eat '['
+
+ // There's no comma after a '[', so we can parse the next operand
+ // immediately.
+ return parseOperand(Operands, false, false);
+ }
+ case AsmToken::LCurly:
+ return parseVectorList(Operands);
+ case AsmToken::Identifier: {
+ // If we're expecting a Condition Code operand, then just parse that.
+ if (isCondCode)
+ return parseCondCode(Operands, invertCondCode);
+
+ // If it's a register name, parse it.
+ if (!parseRegister(Operands))
+ return false;
+
+ // This could be an optional "shift" or "extend" operand.
+ OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
+ // We can only continue if no tokens were eaten.
+ if (GotShift != MatchOperand_NoMatch)
+ return GotShift;
+
+ // This was not a register so parse other operands that start with an
+ // identifier (like labels) as expressions and create them as immediates.
+ const MCExpr *IdVal;
+ S = getLoc();
+ if (getParser().parseExpression(IdVal))
+ return true;
+
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
+ return false;
+ }
+ case AsmToken::Integer:
+ case AsmToken::Real:
+ case AsmToken::Hash: {
+ // #42 -> immediate.
+ S = getLoc();
+ if (getLexer().is(AsmToken::Hash))
+ Parser.Lex();
+
+ // Parse a negative sign
+ bool isNegative = false;
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ isNegative = true;
+ // We need to consume this token only when we have a Real, otherwise
+ // we let parseSymbolicImmVal take care of it
+ if (Parser.getLexer().peekTok().is(AsmToken::Real))
+ Parser.Lex();
+ }
+
+ // The only Real that should come through here is a literal #0.0 for
+ // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+ // so convert the value.
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Real)) {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
+ Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
+ Mnemonic != "fcmlt")
+ return TokError("unexpected floating point literal");
+ else if (IntVal != 0 || isNegative)
+ return TokError("expected floating-point constant #0.0");
+ Parser.Lex(); // Eat the token.
+
+ Operands.push_back(
+ AArch64Operand::CreateToken("#0", false, S, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken(".0", false, S, getContext()));
+ return false;
+ }
+
+ const MCExpr *ImmVal;
+ if (parseSymbolicImmVal(ImmVal))
+ return true;
+
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
+ return false;
+ }
+ }
+}
+
+/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
+/// operands.
+bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ Name = StringSwitch<StringRef>(Name.lower())
+ .Case("beq", "b.eq")
+ .Case("bne", "b.ne")
+ .Case("bhs", "b.hs")
+ .Case("bcs", "b.cs")
+ .Case("blo", "b.lo")
+ .Case("bcc", "b.cc")
+ .Case("bmi", "b.mi")
+ .Case("bpl", "b.pl")
+ .Case("bvs", "b.vs")
+ .Case("bvc", "b.vc")
+ .Case("bhi", "b.hi")
+ .Case("bls", "b.ls")
+ .Case("bge", "b.ge")
+ .Case("blt", "b.lt")
+ .Case("bgt", "b.gt")
+ .Case("ble", "b.le")
+ .Case("bal", "b.al")
+ .Case("bnv", "b.nv")
+ .Default(Name);
+
+ // Create the leading tokens for the mnemonic, split by '.' characters.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Head = Name.slice(Start, Next);
+
+ // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
+ if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") {
+ bool IsError = parseSysAlias(Head, NameLoc, Operands);
+ if (IsError && getLexer().isNot(AsmToken::EndOfStatement))
+ Parser.eatToEndOfStatement();
+ return IsError;
+ }
+
+ Operands.push_back(
+ AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
+ Mnemonic = Head;
+
+ // Handle condition codes for a branch mnemonic
+ if (Head == "b" && Next != StringRef::npos) {
+ Start = Next;
+ Next = Name.find('.', Start + 1);
+ Head = Name.slice(Start + 1, Next);
+
+ SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ (Head.data() - Name.data()));
+ AArch64CC::CondCode CC = parseCondCodeString(Head);
+ if (CC == AArch64CC::Invalid)
+ return Error(SuffixLoc, "invalid condition code");
+ Operands.push_back(
+ AArch64Operand::CreateToken(".", true, SuffixLoc, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
+ }
+
+ // Add the remaining tokens in the mnemonic.
+ while (Next != StringRef::npos) {
+ Start = Next;
+ Next = Name.find('.', Start + 1);
+ Head = Name.slice(Start, Next);
+ SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ (Head.data() - Name.data()) + 1);
+ Operands.push_back(
+ AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
+ }
+
+ // Conditional compare instructions have a Condition Code operand, which needs
+ // to be parsed and an immediate operand created.
+ bool condCodeFourthOperand =
+ (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
+ Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
+ Head == "csinc" || Head == "csinv" || Head == "csneg");
+
+ // These instructions are aliases to some of the conditional select
+ // instructions. However, the condition code is inverted in the aliased
+ // instruction.
+ //
+ // FIXME: Is this the correct way to handle these? Or should the parser
+ // generate the aliased instructions directly?
+ bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
+ bool condCodeThirdOperand =
+ (Head == "cinc" || Head == "cinv" || Head == "cneg");
+
+ // Read the remaining operands.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
// Read the first operand.
- if (ParseOperand(Operands, Mnemonic)) {
+ if (parseOperand(Operands, false, false)) {
Parser.eatToEndOfStatement();
return true;
}
+ unsigned N = 2;
while (getLexer().is(AsmToken::Comma)) {
- Parser.Lex(); // Eat the comma.
+ Parser.Lex(); // Eat the comma.
// Parse and remember the operand.
- if (ParseOperand(Operands, Mnemonic)) {
+ if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
+ (N == 3 && condCodeThirdOperand) ||
+ (N == 2 && condCodeSecondOperand),
+ condCodeSecondOperand || condCodeThirdOperand)) {
Parser.eatToEndOfStatement();
return true;
}
-
// After successfully parsing some operands there are two special cases to
// consider (i.e. notional operands not separated by commas). Both are due
// to memory specifiers:
@@ -2321,52 +3121,716 @@
// in the given context!
if (Parser.getTok().is(AsmToken::RBrac)) {
SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateToken("]", Loc));
+ Operands.push_back(AArch64Operand::CreateToken("]", false, Loc,
+ getContext()));
Parser.Lex();
}
if (Parser.getTok().is(AsmToken::Exclaim)) {
SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateToken("!", Loc));
+ Operands.push_back(AArch64Operand::CreateToken("!", false, Loc,
+ getContext()));
Parser.Lex();
}
+
+ ++N;
}
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- SMLoc Loc = getLexer().getLoc();
+ SMLoc Loc = Parser.getTok().getLoc();
Parser.eatToEndOfStatement();
- return Error(Loc, "expected comma before next operand");
+ return Error(Loc, "unexpected token in argument list");
}
- // Eat the EndOfStatement
- Parser.Lex();
-
+ Parser.Lex(); // Consume the EndOfStatement
return false;
}
+// FIXME: This entire function is a giant hack to provide us with decent
+// operand range validation/diagnostics until TableGen/MC can be extended
+// to support autogeneration of this kind of validation.
+bool AArch64AsmParser::validateInstruction(MCInst &Inst,
+ SmallVectorImpl<SMLoc> &Loc) {
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+ // Check for indexed addressing modes w/ the base register being the
+ // same as a destination/source register or pair load where
+ // the Rt == Rt2. All of those are undefined behaviour.
+ switch (Inst.getOpcode()) {
+ case AArch64::LDPSWpre:
+ case AArch64::LDPWpost:
+ case AArch64::LDPWpre:
+ case AArch64::LDPXpost:
+ case AArch64::LDPXpre: {
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rt2 = Inst.getOperand(2).getReg();
+ unsigned Rn = Inst.getOperand(3).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable LDP instruction, writeback base "
+ "is also a destination");
+ if (RI->isSubRegisterEq(Rn, Rt2))
+ return Error(Loc[1], "unpredictable LDP instruction, writeback base "
+ "is also a destination");
+ // FALLTHROUGH
+ }
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi: {
+ unsigned Rt = Inst.getOperand(0).getReg();
+ unsigned Rt2 = Inst.getOperand(1).getReg();
+ if (Rt == Rt2)
+ return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+ break;
+ }
+ case AArch64::LDPDpost:
+ case AArch64::LDPDpre:
+ case AArch64::LDPQpost:
+ case AArch64::LDPQpre:
+ case AArch64::LDPSpost:
+ case AArch64::LDPSpre:
+ case AArch64::LDPSWpost: {
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rt2 = Inst.getOperand(2).getReg();
+ if (Rt == Rt2)
+ return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+ break;
+ }
+ case AArch64::STPDpost:
+ case AArch64::STPDpre:
+ case AArch64::STPQpost:
+ case AArch64::STPQpre:
+ case AArch64::STPSpost:
+ case AArch64::STPSpre:
+ case AArch64::STPWpost:
+ case AArch64::STPWpre:
+ case AArch64::STPXpost:
+ case AArch64::STPXpre: {
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rt2 = Inst.getOperand(2).getReg();
+ unsigned Rn = Inst.getOperand(3).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable STP instruction, writeback base "
+ "is also a source");
+ if (RI->isSubRegisterEq(Rn, Rt2))
+ return Error(Loc[1], "unpredictable STP instruction, writeback base "
+ "is also a source");
+ break;
+ }
+ case AArch64::LDRBBpre:
+ case AArch64::LDRBpre:
+ case AArch64::LDRHHpre:
+ case AArch64::LDRHpre:
+ case AArch64::LDRSBWpre:
+ case AArch64::LDRSBXpre:
+ case AArch64::LDRSHWpre:
+ case AArch64::LDRSHXpre:
+ case AArch64::LDRSWpre:
+ case AArch64::LDRWpre:
+ case AArch64::LDRXpre:
+ case AArch64::LDRBBpost:
+ case AArch64::LDRBpost:
+ case AArch64::LDRHHpost:
+ case AArch64::LDRHpost:
+ case AArch64::LDRSBWpost:
+ case AArch64::LDRSBXpost:
+ case AArch64::LDRSHWpost:
+ case AArch64::LDRSHXpost:
+ case AArch64::LDRSWpost:
+ case AArch64::LDRWpost:
+ case AArch64::LDRXpost: {
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rn = Inst.getOperand(2).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable LDR instruction, writeback base "
+ "is also a source");
+ break;
+ }
+ case AArch64::STRBBpost:
+ case AArch64::STRBpost:
+ case AArch64::STRHHpost:
+ case AArch64::STRHpost:
+ case AArch64::STRWpost:
+ case AArch64::STRXpost:
+ case AArch64::STRBBpre:
+ case AArch64::STRBpre:
+ case AArch64::STRHHpre:
+ case AArch64::STRHpre:
+ case AArch64::STRWpre:
+ case AArch64::STRXpre: {
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rn = Inst.getOperand(2).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable STR instruction, writeback base "
+ "is also a source");
+ break;
+ }
+ }
+
+ // Now check immediate ranges. Separate from the above as there is overlap
+ // in the instructions being checked and this keeps the nested conditionals
+ // to a minimum.
+ switch (Inst.getOpcode()) {
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ case AArch64::SUBWri:
+ case AArch64::SUBXri: {
+ // Annoyingly we can't do this in the isAddSubImm predicate, so there is
+ // some slight duplication here.
+ if (Inst.getOperand(2).isExpr()) {
+ const MCExpr *Expr = Inst.getOperand(2).getExpr();
+ AArch64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ int64_t Addend;
+ if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+ return Error(Loc[2], "invalid immediate expression");
+ }
+
+ // Only allow these with ADDXri.
+ if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
+ Inst.getOpcode() == AArch64::ADDXri)
+ return false;
+
+ // Only allow these with ADDXri/ADDWri
+ if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+ (Inst.getOpcode() == AArch64::ADDXri ||
+ Inst.getOpcode() == AArch64::ADDWri))
+ return false;
+
+ // Don't allow expressions in the immediate field otherwise
+ return Error(Loc[2], "invalid immediate expression");
+ }
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+ switch (ErrCode) {
+ case Match_MissingFeature:
+ return Error(Loc,
+ "instruction requires a CPU feature not currently enabled");
+ case Match_InvalidOperand:
+ return Error(Loc, "invalid operand for instruction");
+ case Match_InvalidSuffix:
+ return Error(Loc, "invalid type suffix for instruction");
+ case Match_InvalidCondCode:
+ return Error(Loc, "expected AArch64 condition code");
+ case Match_AddSubRegExtendSmall:
+ return Error(Loc,
+ "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
+ case Match_AddSubRegExtendLarge:
+ return Error(Loc,
+ "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
+ case Match_AddSubSecondSource:
+ return Error(Loc,
+ "expected compatible register, symbol or integer in range [0, 4095]");
+ case Match_LogicalSecondSource:
+ return Error(Loc, "expected compatible register or logical immediate");
+ case Match_InvalidMovImm32Shift:
+ return Error(Loc, "expected 'lsl' with optional integer 0 or 16");
+ case Match_InvalidMovImm64Shift:
+ return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48");
+ case Match_AddSubRegShift32:
+ return Error(Loc,
+ "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
+ case Match_AddSubRegShift64:
+ return Error(Loc,
+ "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
+ case Match_InvalidFPImm:
+ return Error(Loc,
+ "expected compatible register or floating-point constant");
+ case Match_InvalidMemoryIndexedSImm9:
+ return Error(Loc, "index must be an integer in range [-256, 255].");
+ case Match_InvalidMemoryIndexed4SImm7:
+ return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
+ case Match_InvalidMemoryIndexed8SImm7:
+ return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
+ case Match_InvalidMemoryIndexed16SImm7:
+ return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
+ case Match_InvalidMemoryWExtend8:
+ return Error(Loc,
+ "expected 'uxtw' or 'sxtw' with optional shift of #0");
+ case Match_InvalidMemoryWExtend16:
+ return Error(Loc,
+ "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
+ case Match_InvalidMemoryWExtend32:
+ return Error(Loc,
+ "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
+ case Match_InvalidMemoryWExtend64:
+ return Error(Loc,
+ "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
+ case Match_InvalidMemoryWExtend128:
+ return Error(Loc,
+ "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4");
+ case Match_InvalidMemoryXExtend8:
+ return Error(Loc,
+ "expected 'lsl' or 'sxtx' with optional shift of #0");
+ case Match_InvalidMemoryXExtend16:
+ return Error(Loc,
+ "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
+ case Match_InvalidMemoryXExtend32:
+ return Error(Loc,
+ "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
+ case Match_InvalidMemoryXExtend64:
+ return Error(Loc,
+ "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
+ case Match_InvalidMemoryXExtend128:
+ return Error(Loc,
+ "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
+ case Match_InvalidMemoryIndexed1:
+ return Error(Loc, "index must be an integer in range [0, 4095].");
+ case Match_InvalidMemoryIndexed2:
+ return Error(Loc, "index must be a multiple of 2 in range [0, 8190].");
+ case Match_InvalidMemoryIndexed4:
+ return Error(Loc, "index must be a multiple of 4 in range [0, 16380].");
+ case Match_InvalidMemoryIndexed8:
+ return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
+ case Match_InvalidMemoryIndexed16:
+ return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+ case Match_InvalidImm0_7:
+ return Error(Loc, "immediate must be an integer in range [0, 7].");
+ case Match_InvalidImm0_15:
+ return Error(Loc, "immediate must be an integer in range [0, 15].");
+ case Match_InvalidImm0_31:
+ return Error(Loc, "immediate must be an integer in range [0, 31].");
+ case Match_InvalidImm0_63:
+ return Error(Loc, "immediate must be an integer in range [0, 63].");
+ case Match_InvalidImm0_127:
+ return Error(Loc, "immediate must be an integer in range [0, 127].");
+ case Match_InvalidImm0_65535:
+ return Error(Loc, "immediate must be an integer in range [0, 65535].");
+ case Match_InvalidImm1_8:
+ return Error(Loc, "immediate must be an integer in range [1, 8].");
+ case Match_InvalidImm1_16:
+ return Error(Loc, "immediate must be an integer in range [1, 16].");
+ case Match_InvalidImm1_32:
+ return Error(Loc, "immediate must be an integer in range [1, 32].");
+ case Match_InvalidImm1_64:
+ return Error(Loc, "immediate must be an integer in range [1, 64].");
+ case Match_InvalidIndex1:
+ return Error(Loc, "expected lane specifier '[1]'");
+ case Match_InvalidIndexB:
+ return Error(Loc, "vector lane must be an integer in range [0, 15].");
+ case Match_InvalidIndexH:
+ return Error(Loc, "vector lane must be an integer in range [0, 7].");
+ case Match_InvalidIndexS:
+ return Error(Loc, "vector lane must be an integer in range [0, 3].");
+ case Match_InvalidIndexD:
+ return Error(Loc, "vector lane must be an integer in range [0, 1].");
+ case Match_InvalidLabel:
+ return Error(Loc, "expected label or encodable integer pc offset");
+ case Match_MRS:
+ return Error(Loc, "expected readable system register");
+ case Match_MSR:
+ return Error(Loc, "expected writable system register or pstate");
+ case Match_MnemonicFail:
+ return Error(Loc, "unrecognized instruction mnemonic");
+ default:
+ assert(0 && "unexpected error code!");
+ return Error(Loc, "invalid instruction format");
+ }
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+
+bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[0]);
+ assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+
+ StringRef Tok = Op->getToken();
+ unsigned NumOperands = Operands.size();
+
+ if (NumOperands == 4 && Tok == "lsl") {
+ AArch64Operand *Op2 = static_cast<AArch64Operand *>(Operands[2]);
+ AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+ if (Op2->isReg() && Op3->isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ if (Op3CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t NewOp3Val = 0;
+ uint64_t NewOp4Val = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+ Op2->getReg())) {
+ NewOp3Val = (32 - Op3Val) & 0x1f;
+ NewOp4Val = 31 - Op3Val;
+ } else {
+ NewOp3Val = (64 - Op3Val) & 0x3f;
+ NewOp4Val = 63 - Op3Val;
+ }
+
+ const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext());
+ const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
+
+ Operands[0] = AArch64Operand::CreateToken(
+ "ubfm", false, Op->getStartLoc(), getContext());
+ Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
+ Op3->getEndLoc(), getContext());
+ Operands.push_back(AArch64Operand::CreateImm(
+ NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
+ delete Op3;
+ delete Op;
+ }
+ }
+ } else if (NumOperands == 5) {
+ // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+ // UBFIZ -> UBFM aliases.
+ if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+ AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+ AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+ AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+ if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+ if (Op3CE && Op4CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t Op4Val = Op4CE->getValue();
+
+ uint64_t RegWidth = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op1->getReg()))
+ RegWidth = 64;
+ else
+ RegWidth = 32;
+
+ if (Op3Val >= RegWidth)
+ return Error(Op3->getStartLoc(),
+ "expected integer in range [0, 31]");
+ if (Op4Val < 1 || Op4Val > RegWidth)
+ return Error(Op4->getStartLoc(),
+ "expected integer in range [1, 32]");
+
+ uint64_t NewOp3Val = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+ Op1->getReg()))
+ NewOp3Val = (32 - Op3Val) & 0x1f;
+ else
+ NewOp3Val = (64 - Op3Val) & 0x3f;
+
+ uint64_t NewOp4Val = Op4Val - 1;
+
+ if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+ return Error(Op4->getStartLoc(),
+ "requested insert overflows register");
+
+ const MCExpr *NewOp3 =
+ MCConstantExpr::Create(NewOp3Val, getContext());
+ const MCExpr *NewOp4 =
+ MCConstantExpr::Create(NewOp4Val, getContext());
+ Operands[3] = AArch64Operand::CreateImm(
+ NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext());
+ Operands[4] = AArch64Operand::CreateImm(
+ NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+ if (Tok == "bfi")
+ Operands[0] = AArch64Operand::CreateToken(
+ "bfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "sbfiz")
+ Operands[0] = AArch64Operand::CreateToken(
+ "sbfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "ubfiz")
+ Operands[0] = AArch64Operand::CreateToken(
+ "ubfm", false, Op->getStartLoc(), getContext());
+ else
+ llvm_unreachable("No valid mnemonic for alias?");
+
+ delete Op;
+ delete Op3;
+ delete Op4;
+ }
+ }
+
+ // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+ // UBFX -> UBFM aliases.
+ } else if (NumOperands == 5 &&
+ (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+ AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+ AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+ AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+ if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+ if (Op3CE && Op4CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t Op4Val = Op4CE->getValue();
+
+ uint64_t RegWidth = 0;
+ if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op1->getReg()))
+ RegWidth = 64;
+ else
+ RegWidth = 32;
+
+ if (Op3Val >= RegWidth)
+ return Error(Op3->getStartLoc(),
+ "expected integer in range [0, 31]");
+ if (Op4Val < 1 || Op4Val > RegWidth)
+ return Error(Op4->getStartLoc(),
+ "expected integer in range [1, 32]");
+
+ uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+ if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+ return Error(Op4->getStartLoc(),
+ "requested extract overflows register");
+
+ const MCExpr *NewOp4 =
+ MCConstantExpr::Create(NewOp4Val, getContext());
+ Operands[4] = AArch64Operand::CreateImm(
+ NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+ if (Tok == "bfxil")
+ Operands[0] = AArch64Operand::CreateToken(
+ "bfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "sbfx")
+ Operands[0] = AArch64Operand::CreateToken(
+ "sbfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "ubfx")
+ Operands[0] = AArch64Operand::CreateToken(
+ "ubfm", false, Op->getStartLoc(), getContext());
+ else
+ llvm_unreachable("No valid mnemonic for alias?");
+
+ delete Op;
+ delete Op4;
+ }
+ }
+ }
+ }
+ // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+ // InstAlias can't quite handle this since the reg classes aren't
+ // subclasses.
+ if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+ if (Op->isReg()) {
+ unsigned Reg = getXRegFromWReg(Op->getReg());
+ Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete Op;
+ }
+ }
+ // FIXME: Likewise for sxt[bh] with a Xd dst operand
+ else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+ if (Op->isReg() &&
+ AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op->getReg())) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+ if (Op->isReg()) {
+ unsigned Reg = getXRegFromWReg(Op->getReg());
+ Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete Op;
+ }
+ }
+ }
+ // FIXME: Likewise for uxt[bh] with a Xd dst operand
+ else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+ if (Op->isReg() &&
+ AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Op->getReg())) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR32. Twiddle it here if necessary.
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+ if (Op->isReg()) {
+ unsigned Reg = getWRegFromXReg(Op->getReg());
+ Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete Op;
+ }
+ }
+ }
+
+ // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
+ if (NumOperands == 3 && Tok == "fmov") {
+ AArch64Operand *RegOp = static_cast<AArch64Operand *>(Operands[1]);
+ AArch64Operand *ImmOp = static_cast<AArch64Operand *>(Operands[2]);
+ if (RegOp->isReg() && ImmOp->isFPImm() &&
+ ImmOp->getFPImm() == (unsigned)-1) {
+ unsigned zreg =
+ AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains(
+ RegOp->getReg())
+ ? AArch64::WZR
+ : AArch64::XZR;
+ Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete ImmOp;
+ }
+ }
+
+ MCInst Inst;
+ // First try to match against the secondary set of tables containing the
+ // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+
+ // If that fails, try against the alternate table containing long-form NEON:
+ // "fadd v0.2s, v1.2s, v2.2s"
+ if (MatchResult != Match_Success)
+ MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+
+ switch (MatchResult) {
+ case Match_Success: {
+ // Perform range checking and other semantic validations
+ SmallVector<SMLoc, 8> OperandLocs;
+ NumOperands = Operands.size();
+ for (unsigned i = 1; i < NumOperands; ++i)
+ OperandLocs.push_back(Operands[i]->getStartLoc());
+ if (validateInstruction(Inst, OperandLocs))
+ return true;
+
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, STI);
+ return false;
+ }
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Special case the error message for the very common case where only
+ // a single subtarget feature is missing (neon, e.g.).
+ std::string Msg = "instruction requires:";
+ unsigned Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg);
+ }
+ case Match_MnemonicFail:
+ return showMatchError(IDLoc, MatchResult);
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ // If the match failed on a suffix token operand, tweak the diagnostic
+ // accordingly.
+ if (((AArch64Operand *)Operands[ErrorInfo])->isToken() &&
+ ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix())
+ MatchResult = Match_InvalidSuffix;
+
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ case Match_InvalidMemoryIndexed1:
+ case Match_InvalidMemoryIndexed2:
+ case Match_InvalidMemoryIndexed4:
+ case Match_InvalidMemoryIndexed8:
+ case Match_InvalidMemoryIndexed16:
+ case Match_InvalidCondCode:
+ case Match_AddSubRegExtendSmall:
+ case Match_AddSubRegExtendLarge:
+ case Match_AddSubSecondSource:
+ case Match_LogicalSecondSource:
+ case Match_AddSubRegShift32:
+ case Match_AddSubRegShift64:
+ case Match_InvalidMovImm32Shift:
+ case Match_InvalidMovImm64Shift:
+ case Match_InvalidFPImm:
+ case Match_InvalidMemoryWExtend8:
+ case Match_InvalidMemoryWExtend16:
+ case Match_InvalidMemoryWExtend32:
+ case Match_InvalidMemoryWExtend64:
+ case Match_InvalidMemoryWExtend128:
+ case Match_InvalidMemoryXExtend8:
+ case Match_InvalidMemoryXExtend16:
+ case Match_InvalidMemoryXExtend32:
+ case Match_InvalidMemoryXExtend64:
+ case Match_InvalidMemoryXExtend128:
+ case Match_InvalidMemoryIndexed4SImm7:
+ case Match_InvalidMemoryIndexed8SImm7:
+ case Match_InvalidMemoryIndexed16SImm7:
+ case Match_InvalidMemoryIndexedSImm9:
+ case Match_InvalidImm0_7:
+ case Match_InvalidImm0_15:
+ case Match_InvalidImm0_31:
+ case Match_InvalidImm0_63:
+ case Match_InvalidImm0_127:
+ case Match_InvalidImm0_65535:
+ case Match_InvalidImm1_8:
+ case Match_InvalidImm1_16:
+ case Match_InvalidImm1_32:
+ case Match_InvalidImm1_64:
+ case Match_InvalidIndex1:
+ case Match_InvalidIndexB:
+ case Match_InvalidIndexH:
+ case Match_InvalidIndexS:
+ case Match_InvalidIndexD:
+ case Match_InvalidLabel:
+ case Match_MSR:
+ case Match_MRS: {
+ // Any time we get here, there's nothing fancy to do. Just get the
+ // operand SMLoc and display the diagnostic.
+ SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ }
+
+ llvm_unreachable("Implement any new match types added!");
+ return true;
+}
+
+/// ParseDirective parses the arm specific directives
bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
+ SMLoc Loc = DirectiveID.getLoc();
if (IDVal == ".hword")
- return ParseDirectiveWord(2, DirectiveID.getLoc());
- else if (IDVal == ".word")
- return ParseDirectiveWord(4, DirectiveID.getLoc());
- else if (IDVal == ".xword")
- return ParseDirectiveWord(8, DirectiveID.getLoc());
- else if (IDVal == ".tlsdesccall")
- return ParseDirectiveTLSDescCall(DirectiveID.getLoc());
+ return parseDirectiveWord(2, Loc);
+ if (IDVal == ".word")
+ return parseDirectiveWord(4, Loc);
+ if (IDVal == ".xword")
+ return parseDirectiveWord(8, Loc);
+ if (IDVal == ".tlsdesccall")
+ return parseDirectiveTLSDescCall(Loc);
- return true;
+ return parseDirectiveLOH(IDVal, Loc);
}
/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
-bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
- return false;
+ return true;
getParser().getStreamer().EmitValue(Value, Size);
@@ -2374,10 +3838,8 @@
break;
// FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma)) {
- Error(L, "unexpected token in directive");
- return false;
- }
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
Parser.Lex();
}
}
@@ -2388,15 +3850,14 @@
// parseDirectiveTLSDescCall:
// ::= .tlsdesccall symbol
-bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) {
+bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
StringRef Name;
- if (getParser().parseIdentifier(Name)) {
- Error(L, "expected symbol after directive");
- return false;
- }
+ if (getParser().parseIdentifier(Name))
+ return Error(L, "expected symbol after directive");
MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
- const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+ Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
MCInst Inst;
Inst.setOpcode(AArch64::TLSDESCCALL);
@@ -2406,271 +3867,181 @@
return false;
}
-
-bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
- MCInst Inst;
- unsigned MatchResult;
- MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
- MatchingInlineAsm);
-
- if (ErrorInfo != ~0U && ErrorInfo >= Operands.size())
- return Error(IDLoc, "too few operands for instruction");
-
- switch (MatchResult) {
- default: break;
- case Match_Success:
- if (validateInstruction(Inst, Operands))
- return true;
-
- Out.EmitInstruction(Inst, STI);
- return false;
- case Match_MissingFeature:
- Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+/// ::= .loh <lohName | lohId> label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+ if (IDVal != MCLOHDirectiveName())
return true;
- case Match_InvalidOperand: {
- SMLoc ErrorLoc = IDLoc;
- if (ErrorInfo != ~0U) {
- ErrorLoc = ((AArch64Operand*)Operands[ErrorInfo])->getStartLoc();
- if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
- }
+ MCLOHType Kind;
+ if (getParser().getTok().isNot(AsmToken::Identifier)) {
+ if (getParser().getTok().isNot(AsmToken::Integer))
+ return TokError("expected an identifier or a number in directive");
+ // We successfully get a numeric value for the identifier.
+ // Check if it is valid.
+ int64_t Id = getParser().getTok().getIntVal();
+ Kind = (MCLOHType)Id;
+ // Check that Id does not overflow MCLOHType.
+ if (!isValidMCLOHType(Kind) || Id != Kind)
+ return TokError("invalid numeric identifier in directive");
+ } else {
+ StringRef Name = getTok().getIdentifier();
+ // We successfully parse an identifier.
+ // Check if it is a recognized one.
+ int Id = MCLOHNameToId(Name);
- return Error(ErrorLoc, "invalid operand for instruction");
+ if (Id == -1)
+ return TokError("invalid identifier in directive");
+ Kind = (MCLOHType)Id;
}
- case Match_MnemonicFail:
- return Error(IDLoc, "invalid instruction");
+ // Consume the identifier.
+ Lex();
+ // Get the number of arguments of this LOH.
+ int NbArgs = MCLOHIdToNbArgs(Kind);
- case Match_AddSubRegExtendSmall:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
- case Match_AddSubRegExtendLarge:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
- case Match_AddSubRegShift32:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
- case Match_AddSubRegShift64:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
- case Match_AddSubSecondSource:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected compatible register, symbol or integer in range [0, 4095]");
- case Match_CVTFixedPos32:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [1, 32]");
- case Match_CVTFixedPos64:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [1, 64]");
- case Match_CondCode:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected AArch64 condition code");
- case Match_FPImm:
- // Any situation which allows a nontrivial floating-point constant also
- // allows a register.
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected compatible register or floating-point constant");
- case Match_FPZero:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected floating-point constant #0.0 or invalid register type");
- case Match_Label:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected label or encodable integer pc offset");
- case Match_Lane1:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected lane specifier '[1]'");
- case Match_LoadStoreExtend32_1:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'uxtw' or 'sxtw' with optional shift of #0");
- case Match_LoadStoreExtend32_2:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
- case Match_LoadStoreExtend32_4:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
- case Match_LoadStoreExtend32_8:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
- case Match_LoadStoreExtend32_16:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl' or 'sxtw' with optional shift of #0 or #4");
- case Match_LoadStoreExtend64_1:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl' or 'sxtx' with optional shift of #0");
- case Match_LoadStoreExtend64_2:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
- case Match_LoadStoreExtend64_4:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
- case Match_LoadStoreExtend64_8:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
- case Match_LoadStoreExtend64_16:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
- case Match_LoadStoreSImm7_4:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer multiple of 4 in range [-256, 252]");
- case Match_LoadStoreSImm7_8:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer multiple of 8 in range [-512, 504]");
- case Match_LoadStoreSImm7_16:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer multiple of 16 in range [-1024, 1008]");
- case Match_LoadStoreSImm9:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [-256, 255]");
- case Match_LoadStoreUImm12_1:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic reference or integer in range [0, 4095]");
- case Match_LoadStoreUImm12_2:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic reference or integer in range [0, 8190]");
- case Match_LoadStoreUImm12_4:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic reference or integer in range [0, 16380]");
- case Match_LoadStoreUImm12_8:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic reference or integer in range [0, 32760]");
- case Match_LoadStoreUImm12_16:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic reference or integer in range [0, 65520]");
- case Match_LogicalSecondSource:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected compatible register or logical immediate");
- case Match_MOVWUImm16:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected relocated symbol or integer in range [0, 65535]");
- case Match_MRS:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected readable system register");
- case Match_MSR:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected writable system register or pstate");
- case Match_NamedImm_at:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic 'at' operand: s1e[0-3][rw] or s12e[01][rw]");
- case Match_NamedImm_dbarrier:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 15] or symbolic barrier operand");
- case Match_NamedImm_dc:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected symbolic 'dc' operand");
- case Match_NamedImm_ic:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected 'ic' operand: 'ialluis', 'iallu' or 'ivau'");
- case Match_NamedImm_isb:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 15] or 'sy'");
- case Match_NamedImm_prefetch:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected prefetch hint: p(ld|st|i)l[123](strm|keep)");
- case Match_NamedImm_tlbi:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected translation buffer invalidation operand");
- case Match_UImm16:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 65535]");
- case Match_UImm3:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 7]");
- case Match_UImm4:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 15]");
- case Match_UImm5:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 31]");
- case Match_UImm6:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 63]");
- case Match_UImm7:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 127]");
- case Match_Width32:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [<lsb>, 31]");
- case Match_Width64:
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [<lsb>, 63]");
- case Match_ShrImm8:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [1, 8]");
- case Match_ShrImm16:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [1, 16]");
- case Match_ShrImm32:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [1, 32]");
- case Match_ShrImm64:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [1, 64]");
- case Match_ShlImm8:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 7]");
- case Match_ShlImm16:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 15]");
- case Match_ShlImm32:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 31]");
- case Match_ShlImm64:
- return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
- "expected integer in range [0, 63]");
+ assert(NbArgs != -1 && "Invalid number of arguments");
+
+ SmallVector<MCSymbol *, 3> Args;
+ for (int Idx = 0; Idx < NbArgs; ++Idx) {
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+ Args.push_back(getContext().GetOrCreateSymbol(Name));
+
+ if (Idx + 1 == NbArgs)
+ break;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+ Lex();
}
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
- llvm_unreachable("Implement any new match types added!");
- return true;
+ getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+ return false;
}
-void AArch64Operand::print(raw_ostream &OS) const {
- switch (Kind) {
- case k_CondCode:
- OS << "<CondCode: " << CondCode.Code << ">";
- break;
- case k_FPImmediate:
- OS << "<fpimm: " << FPImm.Val << ">";
- break;
- case k_ImmWithLSL:
- OS << "<immwithlsl: imm=" << ImmWithLSL.Val
- << ", shift=" << ImmWithLSL.ShiftAmount << ">";
- break;
- case k_Immediate:
- getImm()->print(OS);
- break;
- case k_Register:
- OS << "<register " << getReg() << '>';
- break;
- case k_Token:
- OS << '\'' << getToken() << '\'';
- break;
- case k_ShiftExtend:
- OS << "<shift: type=" << ShiftExtend.ShiftType
- << ", amount=" << ShiftExtend.Amount << ">";
- break;
- case k_SysReg: {
- StringRef Name(SysReg.Data, SysReg.Length);
- OS << "<sysreg: " << Name << '>';
- break;
- }
- default:
- llvm_unreachable("No idea how to print this kind of operand");
- break;
- }
-}
+bool
+AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+ AArch64MCExpr::VariantKind &ELFRefKind,
+ MCSymbolRefExpr::VariantKind &DarwinRefKind,
+ int64_t &Addend) {
+ ELFRefKind = AArch64MCExpr::VK_INVALID;
+ DarwinRefKind = MCSymbolRefExpr::VK_None;
+ Addend = 0;
-void AArch64Operand::dump() const {
- print(errs());
-}
+ if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+ ELFRefKind = AE->getKind();
+ Expr = AE->getSubExpr();
+ }
+ const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+ if (SE) {
+ // It's a simple symbol reference with no addend.
+ DarwinRefKind = SE->getKind();
+ return true;
+ }
+
+ const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+ if (!BE)
+ return false;
+
+ SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+ if (!SE)
+ return false;
+ DarwinRefKind = SE->getKind();
+
+ if (BE->getOpcode() != MCBinaryExpr::Add &&
+ BE->getOpcode() != MCBinaryExpr::Sub)
+ return false;
+
+ // See if the addend is is a constant, otherwise there's more going
+ // on here than we can deal with.
+ auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
+ if (!AddendExpr)
+ return false;
+
+ Addend = AddendExpr->getValue();
+ if (BE->getOpcode() == MCBinaryExpr::Sub)
+ Addend = -Addend;
+
+ // It's some symbol reference + a constant addend, but really
+ // shouldn't use both Darwin and ELF syntax.
+ return ELFRefKind == AArch64MCExpr::VK_INVALID ||
+ DarwinRefKind == MCSymbolRefExpr::VK_None;
+}
/// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmParser() {
RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
+
+ RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
+ RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
}
#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
#define GET_MATCHER_IMPLEMENTATION
#include "AArch64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+ unsigned Kind) {
+ AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp);
+ // If the kind is a token for a literal immediate, check if our asm
+ // operand matches. This is for InstAliases which have a fixed-value
+ // immediate in the syntax.
+ int64_t ExpectedVal;
+ switch (Kind) {
+ default:
+ return Match_InvalidOperand;
+ case MCK__35_0:
+ ExpectedVal = 0;
+ break;
+ case MCK__35_1:
+ ExpectedVal = 1;
+ break;
+ case MCK__35_12:
+ ExpectedVal = 12;
+ break;
+ case MCK__35_16:
+ ExpectedVal = 16;
+ break;
+ case MCK__35_2:
+ ExpectedVal = 2;
+ break;
+ case MCK__35_24:
+ ExpectedVal = 24;
+ break;
+ case MCK__35_3:
+ ExpectedVal = 3;
+ break;
+ case MCK__35_32:
+ ExpectedVal = 32;
+ break;
+ case MCK__35_4:
+ ExpectedVal = 4;
+ break;
+ case MCK__35_48:
+ ExpectedVal = 48;
+ break;
+ case MCK__35_6:
+ ExpectedVal = 6;
+ break;
+ case MCK__35_64:
+ ExpectedVal = 64;
+ break;
+ case MCK__35_8:
+ ExpectedVal = 8;
+ break;
+ }
+ if (!Op->isImm())
+ return Match_InvalidOperand;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ if (!CE)
+ return Match_InvalidOperand;
+ if (CE->getValue() == ExpectedVal)
+ return Match_Success;
+ return Match_InvalidOperand;
+}
diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt
index e81ec70..cc0a9d8 100644
--- a/lib/Target/AArch64/AsmParser/CMakeLists.txt
+++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt
@@ -1,3 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
add_llvm_library(LLVMAArch64AsmParser
AArch64AsmParser.cpp
)
+
diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
index 2d8f632..11eb9d5 100644
--- a/lib/Target/AArch64/AsmParser/LLVMBuild.txt
+++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile
index 56c9ef5..00268c7 100644
--- a/lib/Target/AArch64/AsmParser/Makefile
+++ b/lib/Target/AArch64/AsmParser/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMAArch64AsmParser
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' ARM target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index dfc10af..789d549 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -1,37 +1,51 @@
set(LLVM_TARGET_DEFINITIONS AArch64.td)
-tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
-tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
+tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
add_public_tablegen_target(AArch64CommonTableGen)
add_llvm_target(AArch64CodeGen
+ AArch64AddressTypePromotion.cpp
+ AArch64AdvSIMDScalarPass.cpp
AArch64AsmPrinter.cpp
- AArch64BranchFixupPass.cpp
+ AArch64BranchRelaxation.cpp
+ AArch64CleanupLocalDynamicTLSPass.cpp
+ AArch64CollectLOH.cpp
+ AArch64ConditionalCompares.cpp
+ AArch64DeadRegisterDefinitionsPass.cpp
+ AArch64ExpandPseudoInsts.cpp
+ AArch64FastISel.cpp
AArch64FrameLowering.cpp
AArch64ISelDAGToDAG.cpp
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
- AArch64MachineFunctionInfo.cpp
+ AArch64LoadStoreOptimizer.cpp
AArch64MCInstLower.cpp
+ AArch64PromoteConstant.cpp
AArch64RegisterInfo.cpp
AArch64SelectionDAGInfo.cpp
+ AArch64StorePairSuppress.cpp
AArch64Subtarget.cpp
AArch64TargetMachine.cpp
AArch64TargetObjectFile.cpp
AArch64TargetTransformInfo.cpp
- )
+)
+add_dependencies(LLVMAArch64CodeGen intrinsics_gen)
+
+add_subdirectory(TargetInfo)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
-add_subdirectory(TargetInfo)
add_subdirectory(Utils)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 9bd363a..6de27d6 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1,4 +1,4 @@
-//===- AArch64Disassembler.cpp - Disassembler for AArch64 ISA -------------===//
+//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,244 +7,169 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains the functions necessary to decode AArch64 instruction
-// bitpatterns into MCInsts (with the help of TableGenerated information from
-// the instruction definitions).
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "arm-disassembler"
-
-#include "AArch64.h"
-#include "AArch64RegisterInfo.h"
+#include "AArch64Disassembler.h"
+#include "AArch64ExternalSymbolizer.h"
#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-typedef MCDisassembler::DecodeStatus DecodeStatus;
+#define DEBUG_TYPE "aarch64-disassembler"
-namespace {
-/// AArch64 disassembler for all AArch64 platforms.
-class AArch64Disassembler : public MCDisassembler {
- OwningPtr<const MCRegisterInfo> RegInfo;
-public:
- /// Initializes the disassembler.
- ///
- AArch64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info)
- : MCDisassembler(STI), RegInfo(Info) {
- }
+// Pull DecodeStatus and its enum values into the global namespace.
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
- ~AArch64Disassembler() {}
-
- /// See MCDisassembler.
- DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject ®ion,
- uint64_t address,
- raw_ostream &vStream,
- raw_ostream &cStream) const;
-
- const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
-};
-
-}
-
-// Forward-declarations used in the auto-generated files.
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus
-DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus
-DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
- unsigned OptionHiS,
- uint64_t Address,
- const void *Decoder);
-
-
-static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
- unsigned Imm6Bits,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
- unsigned Imm6Bits,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
- unsigned RmBits,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-template<int RegWidth>
-static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
- unsigned FullImm,
- uint64_t Address,
- const void *Decoder);
-
-template<int RegWidth>
-static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
- unsigned Bits,
+static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-
-static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
- unsigned ShiftAmount,
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-static DecodeStatus
-DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
- uint64_t Address, const void *Decoder);
-
-static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
- unsigned ShiftAmount,
+static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
-
-static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-template<typename SomeNamedImmMapper>
-static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus
-DecodeSysRegOperand(const A64SysReg::SysRegMapper &InstMapper,
- llvm::MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-
-static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
const void *Decoder);
-
-static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
-static bool Check(DecodeStatus &Out, DecodeStatus In);
-
-#include "AArch64GenDisassemblerTables.inc"
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -261,486 +186,479 @@
llvm_unreachable("Invalid DecodeStatus!");
}
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+#define SoftFail llvm::MCDisassembler::SoftFail
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new AArch64Disassembler(STI, Ctx);
+}
+
DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
- const MemoryObject &Region,
- uint64_t Address,
- raw_ostream &os,
- raw_ostream &cs) const {
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
CommentStream = &cs;
uint8_t bytes[4];
+ Size = 0;
// We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, bytes) == -1) {
- Size = 0;
- return MCDisassembler::Fail;
- }
+ if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+ return Fail;
+ Size = 4;
// Encoded as a small-endian 32-bit word in the stream.
- uint32_t insn = (bytes[3] << 24) |
- (bytes[2] << 16) |
- (bytes[1] << 8) |
- (bytes[0] << 0);
+ uint32_t insn =
+ (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
// Calling the auto-generated decoder function.
- DecodeStatus result = decodeInstruction(DecoderTableA6432, MI, insn, Address,
- this, STI);
- if (result != MCDisassembler::Fail) {
- Size = 4;
- return result;
- }
-
- MI.clear();
- Size = 0;
- return MCDisassembler::Fail;
+ return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
}
-static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
- const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler*>(D);
- return Dis->getRegInfo()->getRegClass(RC).getRegister(RegNo);
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+ LLVMSymbolLookupCallback SymbolLookUp,
+ void *DisInfo, MCContext *Ctx,
+ MCRelocationInfo *RelInfo) {
+ return new llvm::AArch64ExternalSymbolizer(
+ *Ctx,
+ std::unique_ptr<MCRelocationInfo>(RelInfo),
+ GetOpInfo, SymbolLookUp, DisInfo);
}
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
+extern "C" void LLVMInitializeAArch64Disassembler() {
+ TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget,
+ createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
+ createAArch64ExternalSymbolizer);
- uint16_t Register = getReg(Decoder, AArch64::GPR64RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
+ TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
+ createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+ createAArch64ExternalSymbolizer);
}
-static DecodeStatus
-DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
+static const unsigned FPR128DecoderTable[] = {
+ AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+ AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
+ AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+ AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+ AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+ AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+ AArch64::Q30, AArch64::Q31
+};
- uint16_t Register = getReg(Decoder, AArch64::GPR64xspRegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::GPR32RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus
-DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::GPR32wspRegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus
-DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::FPR8RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus
-DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::FPR16RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-
-static DecodeStatus
-DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::FPR32RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus
-DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::FPR64RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus
-DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 15)
- return MCDisassembler::Fail;
-
- return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
-}
-
-static DecodeStatus
-DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::FPR128RegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus
-DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
- if (RegNo > 15)
- return MCDisassembler::Fail;
-
- return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
-}
-
-static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- if (RegNo > 30)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
- unsigned RegID,
- const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, RegID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
- unsigned OptionHiS,
- uint64_t Address,
- const void *Decoder) {
- // Option{1} must be 1. OptionHiS is made up of {Option{2}, Option{1},
- // S}. Hence we want to check bit 1.
- if (!(OptionHiS & 2))
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(OptionHiS));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
- unsigned Imm6Bits,
- uint64_t Address,
- const void *Decoder) {
- // In the 32-bit variant, bit 6 must be zero. I.e. the immediate must be
- // between 0 and 31.
- if (Imm6Bits > 31)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
- unsigned Imm6Bits,
- uint64_t Address,
- const void *Decoder) {
- // 1 <= Imm <= 32. Encoded as 64 - Imm so: 63 >= Encoded >= 32.
- if (Imm6Bits < 32)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
- unsigned RmBits,
- uint64_t Address,
- const void *Decoder) {
- // Any bits are valid in the instruction (they're architecturally ignored),
- // but a code generator should insert 0.
- Inst.addOperand(MCOperand::CreateImm(0));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::CreateImm(8 - Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::CreateImm(16 - Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::CreateImm(32 - Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::CreateImm(64 - Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- if (Val > 7)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- if (Val > 15)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- if (Val > 31)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Val));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- if (Val > 63)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Val));
- return MCDisassembler::Success;
-}
-
-template<int RegWidth>
-static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
- unsigned FullImm,
- uint64_t Address,
- const void *Decoder) {
- unsigned Imm16 = FullImm & 0xffff;
- unsigned Shift = FullImm >> 16;
-
- if (RegWidth == 32 && Shift > 1) return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Imm16));
- Inst.addOperand(MCOperand::CreateImm(Shift));
- return MCDisassembler::Success;
-}
-
-template<int RegWidth>
-static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
- unsigned Bits,
- uint64_t Address,
- const void *Decoder) {
- uint64_t Imm;
- if (!A64Imms::isLogicalImmBits(RegWidth, Bits, Imm))
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(Bits));
- return MCDisassembler::Success;
-}
-
-
-static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
- unsigned ShiftAmount,
- uint64_t Address,
- const void *Decoder) {
- // Only values 0-4 are valid for this 3-bit field
- if (ShiftAmount > 4)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
- unsigned ShiftAmount,
- uint64_t Address,
- const void *Decoder) {
- // Only values below 32 are valid for a 32-bit register
- if (ShiftAmount > 31)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
- uint64_t Address,
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
const void *Decoder) {
- unsigned Rd = fieldFromInstruction(Insn, 0, 5);
- unsigned Rn = fieldFromInstruction(Insn, 5, 5);
- unsigned ImmS = fieldFromInstruction(Insn, 10, 6);
- unsigned ImmR = fieldFromInstruction(Insn, 16, 6);
- unsigned SF = fieldFromInstruction(Insn, 31, 1);
+ if (RegNo > 31)
+ return Fail;
- // Undef for 0b11 just in case it occurs. Don't want the compiler to optimise
- // out assertions that it thinks should never be hit.
- enum OpcTypes { SBFM = 0, BFM, UBFM, Undef } Opc;
- Opc = (OpcTypes)fieldFromInstruction(Insn, 29, 2);
+ unsigned Register = FPR128DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
- if (!SF) {
- // ImmR and ImmS must be between 0 and 31 for 32-bit instructions.
- if (ImmR > 31 || ImmS > 31)
- return MCDisassembler::Fail;
- }
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 15)
+ return Fail;
+ return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
+}
- if (SF) {
- DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
- // BFM MCInsts use Rd as a source too.
- if (Opc == BFM) DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
- DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
- } else {
- DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
- // BFM MCInsts use Rd as a source too.
- if (Opc == BFM) DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
- DecodeGPR32RegisterClass(Inst, Rn, Address, Decoder);
- }
+static const unsigned FPR64DecoderTable[] = {
+ AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
+ AArch64::D5, AArch64::D6, AArch64::D7, AArch64::D8, AArch64::D9,
+ AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
+ AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
+ AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
+ AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
+ AArch64::D30, AArch64::D31
+};
- // ASR and LSR have more specific patterns so they won't get here:
- assert(!(ImmS == 31 && !SF && Opc != BFM)
- && "shift should have used auto decode");
- assert(!(ImmS == 63 && SF && Opc != BFM)
- && "shift should have used auto decode");
+static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
- // Extension instructions similarly:
- if (Opc == SBFM && ImmR == 0) {
- assert((ImmS != 7 && ImmS != 15) && "extension got here");
- assert((ImmS != 31 || SF == 0) && "extension got here");
- } else if (Opc == UBFM && ImmR == 0) {
- assert((SF != 0 || (ImmS != 7 && ImmS != 15)) && "extension got here");
- }
+ unsigned Register = FPR64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
- if (Opc == UBFM) {
- // It might be a LSL instruction, which actually takes the shift amount
- // itself as an MCInst operand.
- if (SF && (ImmS + 1) % 64 == ImmR) {
- Inst.setOpcode(AArch64::LSLxxi);
- Inst.addOperand(MCOperand::CreateImm(63 - ImmS));
- return MCDisassembler::Success;
- } else if (!SF && (ImmS + 1) % 32 == ImmR) {
- Inst.setOpcode(AArch64::LSLwwi);
- Inst.addOperand(MCOperand::CreateImm(31 - ImmS));
- return MCDisassembler::Success;
- }
- }
+static const unsigned FPR32DecoderTable[] = {
+ AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
+ AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9,
+ AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
+ AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
+ AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
+ AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
+ AArch64::S30, AArch64::S31
+};
- // Otherwise it's definitely either an extract or an insert depending on which
- // of ImmR or ImmS is larger.
- unsigned ExtractOp, InsertOp;
- switch (Opc) {
- default: llvm_unreachable("unexpected instruction trying to decode bitfield");
- case SBFM:
- ExtractOp = SF ? AArch64::SBFXxxii : AArch64::SBFXwwii;
- InsertOp = SF ? AArch64::SBFIZxxii : AArch64::SBFIZwwii;
- break;
- case BFM:
- ExtractOp = SF ? AArch64::BFXILxxii : AArch64::BFXILwwii;
- InsertOp = SF ? AArch64::BFIxxii : AArch64::BFIwwii;
- break;
- case UBFM:
- ExtractOp = SF ? AArch64::UBFXxxii : AArch64::UBFXwwii;
- InsertOp = SF ? AArch64::UBFIZxxii : AArch64::UBFIZwwii;
- break;
- }
+static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
- // Otherwise it's a boring insert or extract
- Inst.addOperand(MCOperand::CreateImm(ImmR));
- Inst.addOperand(MCOperand::CreateImm(ImmS));
+ unsigned Register = FPR32DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+static const unsigned FPR16DecoderTable[] = {
+ AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
+ AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9,
+ AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
+ AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
+ AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
+ AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
+ AArch64::H30, AArch64::H31
+};
- if (ImmS < ImmR)
- Inst.setOpcode(InsertOp);
- else
- Inst.setOpcode(ExtractOp);
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
- return MCDisassembler::Success;
+ unsigned Register = FPR16DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned FPR8DecoderTable[] = {
+ AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4,
+ AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9,
+ AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
+ AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
+ AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
+ AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
+ AArch64::B30, AArch64::B31
+};
+
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = FPR8DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned GPR64DecoderTable[] = {
+ AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
+ AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9,
+ AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+ AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
+ AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
+ AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
+ AArch64::LR, AArch64::XZR
+};
+
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = GPR64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = GPR64DecoderTable[RegNo];
+ if (Register == AArch64::XZR)
+ Register = AArch64::SP;
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned GPR32DecoderTable[] = {
+ AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
+ AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9,
+ AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
+ AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
+ AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
+ AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
+ AArch64::W30, AArch64::WZR
+};
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = GPR32DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = GPR32DecoderTable[RegNo];
+ if (Register == AArch64::WZR)
+ Register = AArch64::WSP;
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned VectorDecoderTable[] = {
+ AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+ AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
+ AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+ AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+ AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+ AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+ AArch64::Q30, AArch64::Q31
+};
+
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = VectorDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned QQDecoderTable[] = {
+ AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4,
+ AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8,
+ AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12,
+ AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
+ AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
+ AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
+ AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
+ AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
+};
+
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = QQDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned QQQDecoderTable[] = {
+ AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4,
+ AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7,
+ AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10,
+ AArch64::Q9_Q10_Q11, AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
+ AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
+ AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
+ AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
+ AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
+ AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
+ AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
+ AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1
+};
+
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = QQQDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned QQQQDecoderTable[] = {
+ AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5,
+ AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8,
+ AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11,
+ AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
+ AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
+ AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
+ AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
+ AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
+ AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
+ AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
+ AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2
+};
+
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = QQQQDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned DDDecoderTable[] = {
+ AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4,
+ AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8,
+ AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12,
+ AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
+ AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
+ AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
+ AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
+ AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
+};
+
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = DDDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned DDDDecoderTable[] = {
+ AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4,
+ AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7,
+ AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10,
+ AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13,
+ AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
+ AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
+ AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
+ AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
+ AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
+ AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
+ AArch64::D30_D31_D0, AArch64::D31_D0_D1
+};
+
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = DDDDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned DDDDDecoderTable[] = {
+ AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5,
+ AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8,
+ AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11,
+ AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
+ AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
+ AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
+ AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
+ AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
+ AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
+ AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
+ AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2
+};
+
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = DDDDDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ // scale{5} is asserted as 1 in tblgen.
+ Imm |= 0x20;
+ Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+ return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+ return Success;
+}
+
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ int64_t ImmVal = Imm;
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+
+ // Sign-extend 19-bit immediate.
+ if (ImmVal & (1 << (19 - 1)))
+ ImmVal |= ~((1LL << 19) - 1);
+
+ if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+ Inst.getOpcode() != AArch64::LDRXl, 0, 4))
+ Inst.addOperand(MCOperand::CreateImm(ImmVal));
+ return Success;
+}
+
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1));
+ Inst.addOperand(MCOperand::CreateImm(Imm & 1));
+ return Success;
+}
+
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+ const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
+
+ Imm |= 0x8000;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+
+ bool ValidNamed;
+ (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
+ .toString(Imm, ValidNamed);
+
+ return ValidNamed ? Success : Fail;
+}
+
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+ const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
+
+ Imm |= 0x8000;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+
+ bool ValidNamed;
+ (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
+ .toString(Imm, ValidNamed);
+
+ return ValidNamed ? Success : Fail;
}
static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
@@ -763,811 +681,879 @@
// Add the lane
Inst.addOperand(MCOperand::CreateImm(1));
- return MCDisassembler::Success;
+ return Success;
}
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+ unsigned Add) {
+ Inst.addOperand(MCOperand::CreateImm(Add - Imm));
+ return Success;
+}
-static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- DecodeStatus Result = MCDisassembler::Success;
- unsigned Rt = fieldFromInstruction(Insn, 0, 5);
- unsigned Rn = fieldFromInstruction(Insn, 5, 5);
- unsigned Rt2 = fieldFromInstruction(Insn, 10, 5);
- unsigned SImm7 = fieldFromInstruction(Insn, 15, 7);
- unsigned L = fieldFromInstruction(Insn, 22, 1);
- unsigned V = fieldFromInstruction(Insn, 26, 1);
- unsigned Opc = fieldFromInstruction(Insn, 30, 2);
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+ unsigned Add) {
+ Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1)));
+ return Success;
+}
- // Not an official name, but it turns out that bit 23 distinguishes indexed
- // from non-indexed operations.
- unsigned Indexed = fieldFromInstruction(Insn, 23, 1);
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 64);
+}
- if (Indexed && L == 0) {
- // The MCInst for an indexed store has an out operand and 4 ins:
- // Rn_wb, Rt, Rt2, Rn, Imm
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
+}
+
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
+}
+
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
+}
+
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(insn, 16, 5);
+ unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+ unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
+ unsigned shift = (shiftHi << 6) | shiftLo;
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case AArch64::ADDWrs:
+ case AArch64::ADDSWrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBSWrs:
+ // if shift == '11' then ReservedValue()
+ if (shiftHi == 0x3)
+ return Fail;
+ // Deliberate fallthrough
+ case AArch64::ANDWrs:
+ case AArch64::ANDSWrs:
+ case AArch64::BICWrs:
+ case AArch64::BICSWrs:
+ case AArch64::ORRWrs:
+ case AArch64::ORNWrs:
+ case AArch64::EORWrs:
+ case AArch64::EONWrs: {
+ // if sf == '0' and imm6<5> == '1' then ReservedValue()
+ if (shiftLo >> 5 == 1)
+ return Fail;
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ }
+ case AArch64::ADDXrs:
+ case AArch64::ADDSXrs:
+ case AArch64::SUBXrs:
+ case AArch64::SUBSXrs:
+ // if shift == '11' then ReservedValue()
+ if (shiftHi == 0x3)
+ return Fail;
+ // Deliberate fallthrough
+ case AArch64::ANDXrs:
+ case AArch64::ANDSXrs:
+ case AArch64::BICXrs:
+ case AArch64::BICSXrs:
+ case AArch64::ORRXrs:
+ case AArch64::ORNXrs:
+ case AArch64::EORXrs:
+ case AArch64::EONXrs:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
}
+ Inst.addOperand(MCOperand::CreateImm(shift));
+ return Success;
+}
+
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned imm = fieldFromInstruction(insn, 5, 16);
+ unsigned shift = fieldFromInstruction(insn, 21, 2);
+ shift <<= 4;
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case AArch64::MOVZWi:
+ case AArch64::MOVNWi:
+ case AArch64::MOVKWi:
+ if (shift & (1U << 5))
+ return Fail;
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case AArch64::MOVZXi:
+ case AArch64::MOVNXi:
+ case AArch64::MOVKXi:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ }
+
+ if (Inst.getOpcode() == AArch64::MOVKWi ||
+ Inst.getOpcode() == AArch64::MOVKXi)
+ Inst.addOperand(Inst.getOperand(0));
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ Inst.addOperand(MCOperand::CreateImm(shift));
+ return Success;
+}
+
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned offset = fieldFromInstruction(insn, 10, 12);
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case AArch64::PRFMui:
+ // Rt is an immediate in prefetch.
+ Inst.addOperand(MCOperand::CreateImm(Rt));
+ break;
+ case AArch64::STRBBui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBWui:
+ case AArch64::STRHHui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRSHWui:
+ case AArch64::STRWui:
+ case AArch64::LDRWui:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ case AArch64::STRXui:
+ case AArch64::LDRXui:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDRDui:
+ case AArch64::STRDui:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDRSui:
+ case AArch64::STRSui:
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDRHui:
+ case AArch64::STRHui:
+ DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDRBui:
+ case AArch64::STRBui:
+ DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4))
+ Inst.addOperand(MCOperand::CreateImm(offset));
+ return Success;
+}
+
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ int64_t offset = fieldFromInstruction(insn, 12, 9);
+
+ // offset is a 9-bit signed immediate, so sign extend it to
+ // fill the unsigned.
+ if (offset & (1 << (9 - 1)))
+ offset |= ~((1LL << 9) - 1);
+
+ // First operand is always the writeback to the address register, if needed.
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+ case AArch64::LDRSBWpre:
+ case AArch64::LDRSHWpre:
+ case AArch64::STRBBpre:
+ case AArch64::LDRBBpre:
+ case AArch64::STRHHpre:
+ case AArch64::LDRHHpre:
+ case AArch64::STRWpre:
+ case AArch64::LDRWpre:
+ case AArch64::LDRSBWpost:
+ case AArch64::LDRSHWpost:
+ case AArch64::STRBBpost:
+ case AArch64::LDRBBpost:
+ case AArch64::STRHHpost:
+ case AArch64::LDRHHpost:
+ case AArch64::STRWpost:
+ case AArch64::LDRWpost:
+ case AArch64::LDRSBXpre:
+ case AArch64::LDRSHXpre:
+ case AArch64::STRXpre:
+ case AArch64::LDRSWpre:
+ case AArch64::LDRXpre:
+ case AArch64::LDRSBXpost:
+ case AArch64::LDRSHXpost:
+ case AArch64::STRXpost:
+ case AArch64::LDRSWpost:
+ case AArch64::LDRXpost:
+ case AArch64::LDRQpre:
+ case AArch64::STRQpre:
+ case AArch64::LDRQpost:
+ case AArch64::STRQpost:
+ case AArch64::LDRDpre:
+ case AArch64::STRDpre:
+ case AArch64::LDRDpost:
+ case AArch64::STRDpost:
+ case AArch64::LDRSpre:
+ case AArch64::STRSpre:
+ case AArch64::LDRSpost:
+ case AArch64::STRSpost:
+ case AArch64::LDRHpre:
+ case AArch64::STRHpre:
+ case AArch64::LDRHpost:
+ case AArch64::STRHpost:
+ case AArch64::LDRBpre:
+ case AArch64::STRBpre:
+ case AArch64::LDRBpost:
+ case AArch64::STRBpost:
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ break;
+ }
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case AArch64::PRFUMi:
+ // Rt is an immediate in prefetch.
+ Inst.addOperand(MCOperand::CreateImm(Rt));
+ break;
+ case AArch64::STURBBi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ case AArch64::STURHHi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHWi:
+ case AArch64::STURWi:
+ case AArch64::LDURWi:
+ case AArch64::LDTRSBWi:
+ case AArch64::LDTRSHWi:
+ case AArch64::STTRWi:
+ case AArch64::LDTRWi:
+ case AArch64::STTRHi:
+ case AArch64::LDTRHi:
+ case AArch64::LDTRBi:
+ case AArch64::STTRBi:
+ case AArch64::LDRSBWpre:
+ case AArch64::LDRSHWpre:
+ case AArch64::STRBBpre:
+ case AArch64::LDRBBpre:
+ case AArch64::STRHHpre:
+ case AArch64::LDRHHpre:
+ case AArch64::STRWpre:
+ case AArch64::LDRWpre:
+ case AArch64::LDRSBWpost:
+ case AArch64::LDRSHWpost:
+ case AArch64::STRBBpost:
+ case AArch64::LDRBBpost:
+ case AArch64::STRHHpost:
+ case AArch64::LDRHHpost:
+ case AArch64::STRWpost:
+ case AArch64::LDRWpost:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::STURXi:
+ case AArch64::LDURXi:
+ case AArch64::LDTRSBXi:
+ case AArch64::LDTRSHXi:
+ case AArch64::LDTRSWi:
+ case AArch64::STTRXi:
+ case AArch64::LDTRXi:
+ case AArch64::LDRSBXpre:
+ case AArch64::LDRSHXpre:
+ case AArch64::STRXpre:
+ case AArch64::LDRSWpre:
+ case AArch64::LDRXpre:
+ case AArch64::LDRSBXpost:
+ case AArch64::LDRSHXpost:
+ case AArch64::STRXpost:
+ case AArch64::LDRSWpost:
+ case AArch64::LDRXpost:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ case AArch64::LDRQpre:
+ case AArch64::STRQpre:
+ case AArch64::LDRQpost:
+ case AArch64::STRQpost:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDURDi:
+ case AArch64::STURDi:
+ case AArch64::LDRDpre:
+ case AArch64::STRDpre:
+ case AArch64::LDRDpost:
+ case AArch64::STRDpost:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDURSi:
+ case AArch64::STURSi:
+ case AArch64::LDRSpre:
+ case AArch64::STRSpre:
+ case AArch64::LDRSpost:
+ case AArch64::STRSpost:
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDURHi:
+ case AArch64::STURHi:
+ case AArch64::LDRHpre:
+ case AArch64::STRHpre:
+ case AArch64::LDRHpost:
+ case AArch64::STRHpost:
+ DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::LDURBi:
+ case AArch64::STURBi:
+ case AArch64::LDRBpre:
+ case AArch64::STRBpre:
+ case AArch64::LDRBpost:
+ case AArch64::STRBpost:
+ DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ Inst.addOperand(MCOperand::CreateImm(offset));
+
+ bool IsLoad = fieldFromInstruction(insn, 22, 1);
+ bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0;
+ bool IsFP = fieldFromInstruction(insn, 26, 1);
+
+ // Cannot write back to a transfer register (but xzr != sp).
+ if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn)
+ return SoftFail;
+
+ return Success;
+}
+
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+ unsigned Rs = fieldFromInstruction(insn, 16, 5);
+
+ unsigned Opcode = Inst.getOpcode();
+ switch (Opcode) {
+ default:
+ return Fail;
+ case AArch64::STLXRW:
+ case AArch64::STLXRB:
+ case AArch64::STLXRH:
+ case AArch64::STXRW:
+ case AArch64::STXRB:
+ case AArch64::STXRH:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case AArch64::LDARW:
+ case AArch64::LDARB:
+ case AArch64::LDARH:
+ case AArch64::LDAXRW:
+ case AArch64::LDAXRB:
+ case AArch64::LDAXRH:
+ case AArch64::LDXRW:
+ case AArch64::LDXRB:
+ case AArch64::LDXRH:
+ case AArch64::STLRW:
+ case AArch64::STLRB:
+ case AArch64::STLRH:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::STLXRX:
+ case AArch64::STXRX:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case AArch64::LDARX:
+ case AArch64::LDAXRX:
+ case AArch64::LDXRX:
+ case AArch64::STLRX:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case AArch64::STLXPW:
+ case AArch64::STXPW:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case AArch64::LDAXPW:
+ case AArch64::LDXPW:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case AArch64::STLXPX:
+ case AArch64::STXPX:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case AArch64::LDAXPX:
+ case AArch64::LDXPX:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+
// You shouldn't load to the same register twice in an instruction...
- if (L && Rt == Rt2)
- Result = MCDisassembler::SoftFail;
+ if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW ||
+ Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) &&
+ Rt == Rt2)
+ return SoftFail;
+
+ return Success;
+}
+
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+ int64_t offset = fieldFromInstruction(insn, 15, 7);
+ bool IsLoad = fieldFromInstruction(insn, 22, 1);
+
+ // offset is a 7-bit signed immediate, so sign extend it to
+ // fill the unsigned.
+ if (offset & (1 << (7 - 1)))
+ offset |= ~((1LL << 7) - 1);
+
+ unsigned Opcode = Inst.getOpcode();
+ bool NeedsDisjointWritebackTransfer = false;
+
+ // First operand is always writeback of base register.
+ switch (Opcode) {
+ default:
+ break;
+ case AArch64::LDPXpost:
+ case AArch64::STPXpost:
+ case AArch64::LDPSWpost:
+ case AArch64::LDPXpre:
+ case AArch64::STPXpre:
+ case AArch64::LDPSWpre:
+ case AArch64::LDPWpost:
+ case AArch64::STPWpost:
+ case AArch64::LDPWpre:
+ case AArch64::STPWpre:
+ case AArch64::LDPQpost:
+ case AArch64::STPQpost:
+ case AArch64::LDPQpre:
+ case AArch64::STPQpre:
+ case AArch64::LDPDpost:
+ case AArch64::STPDpost:
+ case AArch64::LDPDpre:
+ case AArch64::STPDpre:
+ case AArch64::LDPSpost:
+ case AArch64::STPSpost:
+ case AArch64::LDPSpre:
+ case AArch64::STPSpre:
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ break;
+ }
+
+ switch (Opcode) {
+ default:
+ return Fail;
+ case AArch64::LDPXpost:
+ case AArch64::STPXpost:
+ case AArch64::LDPSWpost:
+ case AArch64::LDPXpre:
+ case AArch64::STPXpre:
+ case AArch64::LDPSWpre:
+ NeedsDisjointWritebackTransfer = true;
+ // Fallthrough
+ case AArch64::LDNPXi:
+ case AArch64::STNPXi:
+ case AArch64::LDPXi:
+ case AArch64::STPXi:
+ case AArch64::LDPSWi:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case AArch64::LDPWpost:
+ case AArch64::STPWpost:
+ case AArch64::LDPWpre:
+ case AArch64::STPWpre:
+ NeedsDisjointWritebackTransfer = true;
+ // Fallthrough
+ case AArch64::LDNPWi:
+ case AArch64::STNPWi:
+ case AArch64::LDPWi:
+ case AArch64::STPWi:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case AArch64::LDNPQi:
+ case AArch64::STNPQi:
+ case AArch64::LDPQpost:
+ case AArch64::STPQpost:
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
+ case AArch64::LDPQpre:
+ case AArch64::STPQpre:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case AArch64::LDNPDi:
+ case AArch64::STNPDi:
+ case AArch64::LDPDpost:
+ case AArch64::STPDpost:
+ case AArch64::LDPDi:
+ case AArch64::STPDi:
+ case AArch64::LDPDpre:
+ case AArch64::STPDpre:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case AArch64::LDNPSi:
+ case AArch64::STNPSi:
+ case AArch64::LDPSpost:
+ case AArch64::STPSpost:
+ case AArch64::LDPSi:
+ case AArch64::STPSi:
+ case AArch64::LDPSpre:
+ case AArch64::STPSpre:
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ Inst.addOperand(MCOperand::CreateImm(offset));
+
+ // You shouldn't load to the same register twice in an instruction...
+ if (IsLoad && Rt == Rt2)
+ return SoftFail;
// ... or do any operation that writes-back to a transfer register. But note
// that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
- if (Indexed && V == 0 && Rn != 31 && (Rt == Rn || Rt2 == Rn))
- Result = MCDisassembler::SoftFail;
+ if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn))
+ return SoftFail;
- // Exactly how we decode the MCInst's registers depends on the Opc and V
- // fields of the instruction. These also obviously determine the size of the
- // operation so we can fill in that information while we're at it.
- if (V) {
- // The instruction operates on the FP/SIMD registers
- switch (Opc) {
- default: return MCDisassembler::Fail;
- case 0:
- DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
- DecodeFPR32RegisterClass(Inst, Rt2, Address, Decoder);
- break;
- case 1:
- DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
- DecodeFPR64RegisterClass(Inst, Rt2, Address, Decoder);
- break;
- case 2:
- DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
- DecodeFPR128RegisterClass(Inst, Rt2, Address, Decoder);
- break;
- }
- } else {
- switch (Opc) {
- default: return MCDisassembler::Fail;
- case 0:
- DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
- DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder);
- break;
- case 1:
- assert(L && "unexpected \"store signed\" attempt");
- DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
- DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
- break;
- case 2:
- DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
- DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
- break;
- }
- }
-
- if (Indexed && L == 1) {
- // The MCInst for an indexed load has 3 out operands and an 3 ins:
- // Rt, Rt2, Rn_wb, Rt2, Rn, Imm
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- }
-
-
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- Inst.addOperand(MCOperand::CreateImm(SImm7));
-
- return Result;
+ return Success;
}
-static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
- uint32_t Val,
- uint64_t Address,
- const void *Decoder) {
- unsigned Rt = fieldFromInstruction(Val, 0, 5);
- unsigned Rn = fieldFromInstruction(Val, 5, 5);
- unsigned Rt2 = fieldFromInstruction(Val, 10, 5);
- unsigned MemSize = fieldFromInstruction(Val, 30, 2);
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(insn, 16, 5);
+ unsigned extend = fieldFromInstruction(insn, 10, 6);
- DecodeStatus S = MCDisassembler::Success;
- if (Rt == Rt2) S = MCDisassembler::SoftFail;
+ unsigned shift = extend & 0x7;
+ if (shift > 4)
+ return Fail;
- switch (MemSize) {
- case 2:
- if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder)))
- return MCDisassembler::Fail;
- break;
- case 3:
- if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder)))
- return MCDisassembler::Fail;
- break;
- default:
- llvm_unreachable("Invalid MemSize in DecodeLoadPairExclusiveInstruction");
- }
-
- if (!Check(S, DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder)))
- return MCDisassembler::Fail;
-
- return S;
-}
-
-template<typename SomeNamedImmMapper>
-static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- SomeNamedImmMapper Mapper;
- bool ValidNamed;
- Mapper.toString(Val, ValidNamed);
- if (ValidNamed || Mapper.validImm(Val)) {
- Inst.addOperand(MCOperand::CreateImm(Val));
- return MCDisassembler::Success;
- }
-
- return MCDisassembler::Fail;
-}
-
-static DecodeStatus DecodeSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
- llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- bool ValidNamed;
- Mapper.toString(Val, ValidNamed);
-
- Inst.addOperand(MCOperand::CreateImm(Val));
-
- return ValidNamed ? MCDisassembler::Success : MCDisassembler::Fail;
-}
-
-static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- return DecodeSysRegOperand(A64SysReg::MRSMapper(), Inst, Val, Address,
- Decoder);
-}
-
-static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
- unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- return DecodeSysRegOperand(A64SysReg::MSRMapper(), Inst, Val, Address,
- Decoder);
-}
-
-static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- unsigned Rt = fieldFromInstruction(Insn, 0, 5);
- unsigned Rn = fieldFromInstruction(Insn, 5, 5);
- unsigned Imm9 = fieldFromInstruction(Insn, 12, 9);
-
- unsigned Opc = fieldFromInstruction(Insn, 22, 2);
- unsigned V = fieldFromInstruction(Insn, 26, 1);
- unsigned Size = fieldFromInstruction(Insn, 30, 2);
-
- if (Opc == 0 || (V == 1 && Opc == 2)) {
- // It's a store, the MCInst gets: Rn_wb, Rt, Rn, Imm
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- }
-
- if (V == 0 && (Opc == 2 || Size == 3)) {
- DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
- } else if (V == 0) {
- DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
- } else if (V == 1 && (Opc & 2)) {
- DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
- } else {
- switch (Size) {
- case 0:
- DecodeFPR8RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 1:
- DecodeFPR16RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 2:
- DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 3:
- DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
- break;
- }
- }
-
- if (Opc != 0 && (V != 1 || Opc != 2)) {
- // It's a load, the MCInst gets: Rt, Rn_wb, Rn, Imm
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- }
-
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
- Inst.addOperand(MCOperand::CreateImm(Imm9));
-
- // N.b. The official documentation says undpredictable if Rt == Rn, but this
- // takes place at the architectural rather than encoding level:
- //
- // "STR xzr, [sp], #4" is perfectly valid.
- if (V == 0 && Rt == Rn && Rn != 31)
- return MCDisassembler::SoftFail;
- else
- return MCDisassembler::Success;
-}
-
-static MCDisassembler *createAArch64Disassembler(const Target &T,
- const MCSubtargetInfo &STI) {
- return new AArch64Disassembler(STI, T.createMCRegInfo(""));
-}
-
-extern "C" void LLVMInitializeAArch64Disassembler() {
- TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
- createAArch64Disassembler);
- TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
- createAArch64Disassembler);
-}
-
-template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-static DecodeStatus
-DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
- uint64_t Address, const void *Decoder) {
- bool IsLSL = false;
- if (Ext == A64SE::LSL)
- IsLSL = true;
- else if (Ext != A64SE::MSL)
- return MCDisassembler::Fail;
-
- // MSL and LSLH accepts encoded shift amount 0 or 1.
- if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1)
- return MCDisassembler::Fail;
-
- // LSL accepts encoded shift amount 0, 1, 2 or 3.
- if (IsLSL && ShiftAmount > 3)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
- return MCDisassembler::Success;
-}
-
-// Decode post-index vector load/store instructions.
-// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
-// operand is an immediate equal the the length of vector list in bytes,
-// or Rm is decoded to a GPR64noxzr register.
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- unsigned Rt = fieldFromInstruction(Insn, 0, 5);
- unsigned Rn = fieldFromInstruction(Insn, 5, 5);
- unsigned Rm = fieldFromInstruction(Insn, 16, 5);
- unsigned Opcode = fieldFromInstruction(Insn, 12, 4);
- unsigned IsLoad = fieldFromInstruction(Insn, 22, 1);
- // 0 for 64bit vector list, 1 for 128bit vector list
- unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1);
-
- unsigned NumVecs;
- switch (Opcode) {
- case 0: // ld4/st4
- case 2: // ld1/st1 with 4 vectors
- NumVecs = 4; break;
- case 4: // ld3/st3
- case 6: // ld1/st1 with 3 vectors
- NumVecs = 3; break;
- case 7: // ld1/st1 with 1 vector
- NumVecs = 1; break;
- case 8: // ld2/st2
- case 10: // ld1/st1 with 2 vectors
- NumVecs = 2; break;
+ switch (Inst.getOpcode()) {
default:
- llvm_unreachable("Invalid opcode for post-index load/store instructions");
+ return Fail;
+ case AArch64::ADDWrx:
+ case AArch64::SUBWrx:
+ DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case AArch64::ADDSWrx:
+ case AArch64::SUBSWrx:
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case AArch64::ADDXrx:
+ case AArch64::SUBXrx:
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case AArch64::ADDSXrx:
+ case AArch64::SUBSXrx:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case AArch64::ADDXrx64:
+ case AArch64::SUBXrx64:
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case AArch64::SUBSXrx64:
+ case AArch64::ADDSXrx64:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
}
- // Decode vector list of 1/2/3/4 vectors for load instructions.
- if (IsLoad) {
- switch (NumVecs) {
- case 1:
- Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
- : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 2:
- Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 3:
- Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 4:
- Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
- break;
- }
- }
-
- // Decode write back register, which is equal to Rn.
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
- if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte
- Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8)));
- else // Decode Rm
- DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
-
- // Decode vector list of 1/2/3/4 vectors for load instructions.
- if (!IsLoad) {
- switch (NumVecs) {
- case 1:
- Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
- : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 2:
- Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 3:
- Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 4:
- Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
- break;
- }
- }
-
- return MCDisassembler::Success;
+ Inst.addOperand(MCOperand::CreateImm(extend));
+ return Success;
}
-// Decode post-index vector load/store lane instructions.
-// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
-// operand is an immediate equal the the length of the changed bytes,
-// or Rm is decoded to a GPR64noxzr register.
-static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- bool Is64bitVec = false;
- bool IsLoadDup = false;
- bool IsLoad = false;
- // The total number of bytes transferred.
- // TransferBytes = NumVecs * OneLaneBytes
- unsigned TransferBytes = 0;
- unsigned NumVecs = 0;
- unsigned Opc = Inst.getOpcode();
- switch (Opc) {
- case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
- case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
- case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
- case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
- switch (Opc) {
- case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
- TransferBytes = 1; break;
- case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
- TransferBytes = 2; break;
- case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
- TransferBytes = 4; break;
- case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
- TransferBytes = 8; break;
- }
- Is64bitVec = true;
- IsLoadDup = true;
- NumVecs = 1;
- break;
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+ unsigned imm;
+
+ if (Datasize) {
+ if (Inst.getOpcode() == AArch64::ANDSXri)
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+ imm = fieldFromInstruction(insn, 10, 13);
+ if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
+ return Fail;
+ } else {
+ if (Inst.getOpcode() == AArch64::ANDSWri)
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+ imm = fieldFromInstruction(insn, 10, 12);
+ if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32))
+ return Fail;
}
-
- case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
- case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
- case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
- case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
- switch (Opc) {
- case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
- TransferBytes = 1; break;
- case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
- TransferBytes = 2; break;
- case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
- TransferBytes = 4; break;
- case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
- TransferBytes = 8; break;
- }
- IsLoadDup = true;
- NumVecs = 1;
- break;
- }
-
- case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
- case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
- case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
- case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
- switch (Opc) {
- case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
- TransferBytes = 2; break;
- case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
- TransferBytes = 4; break;
- case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
- TransferBytes = 8; break;
- case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
- TransferBytes = 16; break;
- }
- Is64bitVec = true;
- IsLoadDup = true;
- NumVecs = 2;
- break;
- }
-
- case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
- case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
- case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
- case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
- switch (Opc) {
- case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
- TransferBytes = 2; break;
- case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
- TransferBytes = 4; break;
- case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
- TransferBytes = 8; break;
- case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
- TransferBytes = 16; break;
- }
- IsLoadDup = true;
- NumVecs = 2;
- break;
- }
-
- case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
- case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
- case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
- case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
- switch (Opc) {
- case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
- TransferBytes = 3; break;
- case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
- TransferBytes = 6; break;
- case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
- TransferBytes = 12; break;
- case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
- TransferBytes = 24; break;
- }
- Is64bitVec = true;
- IsLoadDup = true;
- NumVecs = 3;
- break;
- }
-
- case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
- case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
- case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
- case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
- switch (Opc) {
- case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
- TransferBytes = 3; break;
- case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
- TransferBytes = 6; break;
- case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
- TransferBytes = 12; break;
- case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
- TransferBytes = 24; break;
- }
- IsLoadDup = true;
- NumVecs = 3;
- break;
- }
-
- case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
- case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
- case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
- case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
- switch (Opc) {
- case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
- TransferBytes = 4; break;
- case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
- TransferBytes = 8; break;
- case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
- TransferBytes = 16; break;
- case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
- TransferBytes = 32; break;
- }
- Is64bitVec = true;
- IsLoadDup = true;
- NumVecs = 4;
- break;
- }
-
- case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
- case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register:
- case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register:
- case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
- switch (Opc) {
- case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
- TransferBytes = 4; break;
- case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
- TransferBytes = 8; break;
- case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
- TransferBytes = 16; break;
- case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
- TransferBytes = 32; break;
- }
- IsLoadDup = true;
- NumVecs = 4;
- break;
- }
-
- case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
- case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
- case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
- case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
- switch (Opc) {
- case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
- TransferBytes = 1; break;
- case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
- TransferBytes = 2; break;
- case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
- TransferBytes = 4; break;
- case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
- TransferBytes = 8; break;
- }
- IsLoad = true;
- NumVecs = 1;
- break;
- }
-
- case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
- case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
- case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
- case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
- switch (Opc) {
- case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
- TransferBytes = 2; break;
- case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
- TransferBytes = 4; break;
- case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
- TransferBytes = 8; break;
- case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
- TransferBytes = 16; break;
- }
- IsLoad = true;
- NumVecs = 2;
- break;
- }
-
- case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
- case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
- case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
- case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
- switch (Opc) {
- case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
- TransferBytes = 3; break;
- case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
- TransferBytes = 6; break;
- case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
- TransferBytes = 12; break;
- case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
- TransferBytes = 24; break;
- }
- IsLoad = true;
- NumVecs = 3;
- break;
- }
-
- case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
- case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
- case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
- case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
- switch (Opc) {
- case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
- TransferBytes = 4; break;
- case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
- TransferBytes = 8; break;
- case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
- TransferBytes = 16; break;
- case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
- TransferBytes = 32; break;
- }
- IsLoad = true;
- NumVecs = 4;
- break;
- }
-
- case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
- case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
- case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
- case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
- switch (Opc) {
- case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
- TransferBytes = 1; break;
- case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
- TransferBytes = 2; break;
- case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
- TransferBytes = 4; break;
- case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
- TransferBytes = 8; break;
- }
- NumVecs = 1;
- break;
- }
-
- case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
- case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
- case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
- case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
- switch (Opc) {
- case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
- TransferBytes = 2; break;
- case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
- TransferBytes = 4; break;
- case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
- TransferBytes = 8; break;
- case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
- TransferBytes = 16; break;
- }
- NumVecs = 2;
- break;
- }
-
- case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
- case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
- case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
- case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
- switch (Opc) {
- case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
- TransferBytes = 3; break;
- case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
- TransferBytes = 6; break;
- case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
- TransferBytes = 12; break;
- case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
- TransferBytes = 24; break;
- }
- NumVecs = 3;
- break;
- }
-
- case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
- case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
- case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
- case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
- switch (Opc) {
- case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
- TransferBytes = 4; break;
- case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
- TransferBytes = 8; break;
- case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
- TransferBytes = 16; break;
- case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
- TransferBytes = 32; break;
- }
- NumVecs = 4;
- break;
- }
-
- default:
- return MCDisassembler::Fail;
- } // End of switch (Opc)
-
- unsigned Rt = fieldFromInstruction(Insn, 0, 5);
- unsigned Rn = fieldFromInstruction(Insn, 5, 5);
- unsigned Rm = fieldFromInstruction(Insn, 16, 5);
-
- // Decode post-index of load duplicate lane
- if (IsLoadDup) {
- switch (NumVecs) {
- case 1:
- Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
- : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 2:
- Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 3:
- Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 4:
- Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
- : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
- }
-
- // Decode write back register, which is equal to Rn.
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
- if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
- Inst.addOperand(MCOperand::CreateImm(TransferBytes));
- else // Decode Rm
- DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
-
- return MCDisassembler::Success;
- }
-
- // Decode post-index of load/store lane
- // Loads have a vector list as output.
- if (IsLoad) {
- switch (NumVecs) {
- case 1:
- DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 2:
- DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 3:
- DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 4:
- DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
- }
- }
-
- // Decode write back register, which is equal to Rn.
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
- DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
- if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
- Inst.addOperand(MCOperand::CreateImm(TransferBytes));
- else // Decode Rm
- DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
-
- // Decode the source vector list.
- switch (NumVecs) {
- case 1:
- DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 2:
- DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 3:
- DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
- break;
- case 4:
- DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
- }
-
- // Decode lane
- unsigned Q = fieldFromInstruction(Insn, 30, 1);
- unsigned S = fieldFromInstruction(Insn, 10, 3);
- unsigned lane = 0;
- // Calculate the number of lanes by number of vectors and transferred bytes.
- // NumLanes = 16 bytes / bytes of each lane
- unsigned NumLanes = 16 / (TransferBytes / NumVecs);
- switch (NumLanes) {
- case 16: // A vector has 16 lanes, each lane is 1 bytes.
- lane = (Q << 3) | S;
- break;
- case 8:
- lane = (Q << 2) | (S >> 1);
- break;
- case 4:
- lane = (Q << 1) | (S >> 2);
- break;
- case 2:
- lane = Q;
- break;
- }
- Inst.addOperand(MCOperand::CreateImm(lane));
-
- return MCDisassembler::Success;
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ return Success;
}
-static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- unsigned Rd = fieldFromInstruction(Insn, 0, 5);
- unsigned Rn = fieldFromInstruction(Insn, 5, 5);
- unsigned size = fieldFromInstruction(Insn, 22, 2);
- unsigned Q = fieldFromInstruction(Insn, 30, 1);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned cmode = fieldFromInstruction(insn, 12, 4);
+ unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+ imm |= fieldFromInstruction(insn, 5, 5);
- DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
-
- if(Q)
- DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+ if (Inst.getOpcode() == AArch64::MOVID)
+ DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
else
- DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder);
+ DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
- switch (size) {
- case 0:
- Inst.addOperand(MCOperand::CreateImm(8));
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ switch (Inst.getOpcode()) {
+ default:
break;
- case 1:
- Inst.addOperand(MCOperand::CreateImm(16));
+ case AArch64::MOVIv4i16:
+ case AArch64::MOVIv8i16:
+ case AArch64::MVNIv4i16:
+ case AArch64::MVNIv8i16:
+ case AArch64::MOVIv2i32:
+ case AArch64::MOVIv4i32:
+ case AArch64::MVNIv2i32:
+ case AArch64::MVNIv4i32:
+ Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
break;
- case 2:
- Inst.addOperand(MCOperand::CreateImm(32));
+ case AArch64::MOVIv2s_msl:
+ case AArch64::MOVIv4s_msl:
+ case AArch64::MVNIv2s_msl:
+ case AArch64::MVNIv4s_msl:
+ Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108));
break;
- default :
- return MCDisassembler::Fail;
}
- return MCDisassembler::Success;
+
+ return Success;
}
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned cmode = fieldFromInstruction(insn, 12, 4);
+ unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+ imm |= fieldFromInstruction(insn, 5, 5);
+
+ // Tied operands added twice.
+ DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
+
+ return Success;
+}
+
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
+ imm |= fieldFromInstruction(insn, 29, 2);
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+
+ // Sign-extend the 21-bit immediate.
+ if (imm & (1 << (21 - 1)))
+ imm |= ~((1LL << 21) - 1);
+
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4))
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return Success;
+}
+
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Imm = fieldFromInstruction(insn, 10, 14);
+ unsigned S = fieldFromInstruction(insn, 29, 1);
+ unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+
+ unsigned ShifterVal = (Imm >> 12) & 3;
+ unsigned ImmVal = Imm & 0xFFF;
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+
+ if (ShifterVal != 0 && ShifterVal != 1)
+ return Fail;
+
+ if (Datasize) {
+ if (Rd == 31 && !S)
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ } else {
+ if (Rd == 31 && !S)
+ DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+ }
+
+ if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4))
+ Inst.addOperand(MCOperand::CreateImm(ImmVal));
+ Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal));
+ return Success;
+}
+
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ int64_t imm = fieldFromInstruction(insn, 0, 26);
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+
+ // Sign-extend the 26-bit immediate.
+ if (imm & (1 << (26 - 1)))
+ imm |= ~((1LL << 26) - 1);
+
+ if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4))
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return Success;
+}
+
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ uint64_t op1 = fieldFromInstruction(insn, 16, 3);
+ uint64_t op2 = fieldFromInstruction(insn, 5, 3);
+ uint64_t crm = fieldFromInstruction(insn, 8, 4);
+
+ uint64_t pstate_field = (op1 << 3) | op2;
+
+ Inst.addOperand(MCOperand::CreateImm(pstate_field));
+ Inst.addOperand(MCOperand::CreateImm(crm));
+
+ bool ValidNamed;
+ (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed);
+
+ return ValidNamed ? Success : Fail;
+}
+
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ uint64_t Rt = fieldFromInstruction(insn, 0, 5);
+ uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
+ bit |= fieldFromInstruction(insn, 19, 5);
+ int64_t dst = fieldFromInstruction(insn, 5, 14);
+ const AArch64Disassembler *Dis =
+ static_cast<const AArch64Disassembler *>(Decoder);
+
+ // Sign-extend 14-bit immediate.
+ if (dst & (1 << (14 - 1)))
+ dst |= ~((1LL << 14) - 1);
+
+ if (fieldFromInstruction(insn, 31, 1) == 0)
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ else
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ Inst.addOperand(MCOperand::CreateImm(bit));
+ if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4))
+ Inst.addOperand(MCOperand::CreateImm(dst));
+
+ return Success;
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
new file mode 100644
index 0000000..68d4867
--- /dev/null
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -0,0 +1,40 @@
+//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64DISASSEMBLER_H
+#define AArch64DISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+
+class AArch64Disassembler : public MCDisassembler {
+public:
+ AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+ : MCDisassembler(STI, Ctx) {}
+
+ ~AArch64Disassembler() {}
+
+ /// getInstruction - See MCDisassembler.
+ MCDisassembler::DecodeStatus
+ getInstruction(MCInst &instr, uint64_t &size, const MemoryObject ®ion,
+ uint64_t address, raw_ostream &vStream,
+ raw_ostream &cStream) const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
new file mode 100644
index 0000000..2466368
--- /dev/null
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -0,0 +1,221 @@
+//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+static MCSymbolRefExpr::VariantKind
+getVariant(uint64_t LLVMDisassembler_VariantKind) {
+ switch (LLVMDisassembler_VariantKind) {
+ case LLVMDisassembler_VariantKind_None:
+ return MCSymbolRefExpr::VK_None;
+ case LLVMDisassembler_VariantKind_ARM64_PAGE:
+ return MCSymbolRefExpr::VK_PAGE;
+ case LLVMDisassembler_VariantKind_ARM64_PAGEOFF:
+ return MCSymbolRefExpr::VK_PAGEOFF;
+ case LLVMDisassembler_VariantKind_ARM64_GOTPAGE:
+ return MCSymbolRefExpr::VK_GOTPAGE;
+ case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
+ return MCSymbolRefExpr::VK_GOTPAGEOFF;
+ case LLVMDisassembler_VariantKind_ARM64_TLVP:
+ case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
+ default:
+ assert(0 && "bad LLVMDisassembler_VariantKind");
+ return MCSymbolRefExpr::VK_None;
+ }
+}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+/// operand in place of the immediate Value in the MCInst. The immediate
+/// Value has not had any PC adjustment made by the caller. If the instruction
+/// is a branch that adds the PC to the immediate Value then isBranch is
+/// Success, else Fail. If GetOpInfo is non-null, then it is called to get any
+/// symbolic information at the Address for this instrution. If that returns
+/// non-zero then the symbolic information it returns is used to create an
+/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo()
+/// returns zero and isBranch is Success then a symbol look up for
+/// Address + Value is done and if a symbol is found an MCExpr is created with
+/// that, else an MCExpr with Address + Value is created. If GetOpInfo()
+/// returns zero and isBranch is Fail then the the Opcode of the MCInst is
+/// tested and for ADRP an other instructions that help to load of pointers
+/// a symbol look up is done to see it is returns a specific reference type
+/// to add to the comment stream. This function returns Success if it adds
+/// an operand to the MCInst and Fail otherwise.
+bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
+ MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
+ bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+ // FIXME: This method shares a lot of code with
+ // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible
+ // refactor the MCExternalSymbolizer interface to allow more of this
+ // implementation to be shared.
+ //
+ struct LLVMOpInfo1 SymbolicOp;
+ memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+ SymbolicOp.Value = Value;
+ uint64_t ReferenceType;
+ const char *ReferenceName;
+ if (!GetOpInfo ||
+ !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+ if (IsBranch) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+ const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
+ Address, &ReferenceName);
+ if (Name) {
+ SymbolicOp.AddSymbol.Name = Name;
+ SymbolicOp.AddSymbol.Present = true;
+ SymbolicOp.Value = 0;
+ } else {
+ SymbolicOp.Value = Address + Value;
+ }
+ if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+ CommentStream << "symbol stub for: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message)
+ CommentStream << "Objc message: " << ReferenceName;
+ } else if (MI.getOpcode() == AArch64::ADRP) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
+ // otool expects the fully encoded ADRP instruction to be passed in as
+ // the value here, so reconstruct it:
+ const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+ uint32_t EncodedInst = 0x90000000;
+ EncodedInst |= (Value & 0x3) << 29; // immlo
+ EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi
+ EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
+ SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+ &ReferenceName);
+ CommentStream << format("0x%llx",
+ 0xfffffffffffff000LL & (Address + Value));
+ } else if (MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::LDRXui ||
+ MI.getOpcode() == AArch64::LDRXl ||
+ MI.getOpcode() == AArch64::ADR) {
+ if (MI.getOpcode() == AArch64::ADDXri)
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
+ else if (MI.getOpcode() == AArch64::LDRXui)
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
+ if (MI.getOpcode() == AArch64::LDRXl) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
+ SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+ &ReferenceName);
+ } else if (MI.getOpcode() == AArch64::ADR) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
+ SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+ &ReferenceName);
+ } else {
+ const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+ // otool expects the fully encoded ADD/LDR instruction to be passed in
+ // as the value here, so reconstruct it:
+ unsigned EncodedInst =
+ MI.getOpcode() == AArch64::ADDXri ? 0x91000000: 0xF9400000;
+ EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD]
+ EncodedInst |=
+ MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn
+ EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd
+
+ SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+ &ReferenceName);
+ }
+ if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+ CommentStream << "literal pool symbol address: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+ CommentStream << "literal pool for: \"" << ReferenceName << "\"";
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+ CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message)
+ CommentStream << "Objc message: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+ CommentStream << "Objc message ref: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+ CommentStream << "Objc selector ref: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+ CommentStream << "Objc class ref: " << ReferenceName;
+ // For these instructions, the SymbolLookUp() above is just to get the
+ // ReferenceType and ReferenceName. We want to make sure not to
+ // fall through so we don't build an MCExpr to leave the disassembly
+ // of the immediate values of these instructions to the InstPrinter.
+ return false;
+ } else {
+ return false;
+ }
+ }
+
+ const MCExpr *Add = nullptr;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+ MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
+ if (Variant != MCSymbolRefExpr::VK_None)
+ Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx);
+ else
+ Add = MCSymbolRefExpr::Create(Sym, Ctx);
+ } else {
+ Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx);
+ }
+ }
+
+ const MCExpr *Sub = nullptr;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, Ctx);
+ } else {
+ Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx);
+ }
+ }
+
+ const MCExpr *Off = nullptr;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
+ else
+ LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
+ if (Off)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
+ else
+ Expr = LHS;
+ } else if (Add) {
+ if (Off)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, Ctx);
+ }
+
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+
+ return true;
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
new file mode 100644
index 0000000..171d31c
--- /dev/null
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -0,0 +1,38 @@
+//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Symbolize AArch64 assembly code during disassembly using callbacks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64EXTERNALSYMBOLIZER_H
+#define AArch64EXTERNALSYMBOLIZER_H
+
+#include "llvm/MC/MCExternalSymbolizer.h"
+
+namespace llvm {
+
+class AArch64ExternalSymbolizer : public MCExternalSymbolizer {
+public:
+ AArch64ExternalSymbolizer(MCContext &Ctx,
+ std::unique_ptr<MCRelocationInfo> RelInfo,
+ LLVMOpInfoCallback GetOpInfo,
+ LLVMSymbolLookupCallback SymbolLookUp,
+ void *DisInfo)
+ : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp,
+ DisInfo) {}
+
+ bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
+ int64_t Value, uint64_t Address, bool IsBranch,
+ uint64_t Offset, uint64_t InstSize) override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/Disassembler/Android.mk b/lib/Target/AArch64/Disassembler/Android.mk
index fcc53ad..b89538d 100644
--- a/lib/Target/AArch64/Disassembler/Android.mk
+++ b/lib/Target/AArch64/Disassembler/Android.mk
@@ -7,7 +7,8 @@
AArch64GenRegisterInfo.inc
arm64_disassembler_SRC_FILES := \
- AArch64Disassembler.cpp
+ AArch64Disassembler.cpp \
+ AArch64ExternalSymbolizer.cpp
# For the device
# =====================================================
diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt
index 21baf25..be4ccad 100644
--- a/lib/Target/AArch64/Disassembler/CMakeLists.txt
+++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt
@@ -1,3 +1,14 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
add_llvm_library(LLVMAArch64Disassembler
AArch64Disassembler.cpp
+ AArch64ExternalSymbolizer.cpp
)
+# workaround for hanging compilation on MSVC8, 9 and 10
+#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
+#set_property(
+# SOURCE ARMDisassembler.cpp
+# PROPERTY COMPILE_FLAGS "/Od"
+# )
+#endif()
+add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index 05c4ed1..a4224f4 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ----------*- Conf -*--===;
+;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile
index 5c86120..741bb81 100644
--- a/lib/Target/AArch64/Disassembler/Makefile
+++ b/lib/Target/AArch64/Disassembler/Makefile
@@ -10,7 +10,7 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMAArch64Disassembler
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' arm target directory to grab private headers
CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index fd3f009..f484a5b 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -11,369 +11,881 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "asm-printer"
#include "AArch64InstPrinter.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-
using namespace llvm;
+#define DEBUG_TYPE "asm-printer"
+
#define GET_INSTRUCTION_NAME
#define PRINT_ALIAS_INSTR
#include "AArch64GenAsmWriter.inc"
-
-static int64_t unpackSignedImm(int BitWidth, uint64_t Value) {
- assert(!(Value & ~((1ULL << BitWidth)-1)) && "immediate not n-bit");
- if (Value & (1ULL << (BitWidth - 1)))
- return static_cast<int64_t>(Value) - (1LL << BitWidth);
- else
- return Value;
-}
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter1.inc"
AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI) :
- MCInstPrinter(MAI, MII, MRI) {
+ const MCSubtargetInfo &STI)
+ : MCInstPrinter(MAI, MII, MRI) {
// Initialize the set of available features.
setAvailableFeatures(STI.getFeatureBits());
}
+AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : AArch64InstPrinter(MAI, MII, MRI, STI) {}
+
void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ // This is for .cfi directives.
OS << getRegisterName(RegNo);
}
-void
-AArch64InstPrinter::printOffsetSImm9Operand(const MCInst *MI,
- unsigned OpNum, raw_ostream &O) {
- const MCOperand &MOImm = MI->getOperand(OpNum);
- int32_t Imm = unpackSignedImm(9, MOImm.getImm());
+void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
+ // Check for special encodings and print the canonical alias instead.
- O << '#' << Imm;
-}
+ unsigned Opcode = MI->getOpcode();
-void
-AArch64InstPrinter::printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, unsigned MemSize,
- unsigned RmSize) {
- unsigned ExtImm = MI->getOperand(OpNum).getImm();
- unsigned OptionHi = ExtImm >> 1;
- unsigned S = ExtImm & 1;
- bool IsLSL = OptionHi == 1 && RmSize == 64;
-
- const char *Ext;
- switch (OptionHi) {
- case 1:
- Ext = (RmSize == 32) ? "uxtw" : "lsl";
- break;
- case 3:
- Ext = (RmSize == 32) ? "sxtw" : "sxtx";
- break;
- default:
- llvm_unreachable("Incorrect Option on load/store (reg offset)");
- }
- O << Ext;
-
- if (S) {
- unsigned ShiftAmt = Log2_32(MemSize);
- O << " #" << ShiftAmt;
- } else if (IsLSL) {
- O << " #0";
- }
-}
-
-void
-AArch64InstPrinter::printAddSubImmLSL0Operand(const MCInst *MI,
- unsigned OpNum, raw_ostream &O) {
- const MCOperand &Imm12Op = MI->getOperand(OpNum);
-
- if (Imm12Op.isImm()) {
- int64_t Imm12 = Imm12Op.getImm();
- assert(Imm12 >= 0 && "Invalid immediate for add/sub imm");
- O << "#" << Imm12;
- } else {
- assert(Imm12Op.isExpr() && "Unexpected shift operand type");
- O << "#" << *Imm12Op.getExpr();
- }
-}
-
-void
-AArch64InstPrinter::printAddSubImmLSL12Operand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
-
- printAddSubImmLSL0Operand(MI, OpNum, O);
-
- O << ", lsl #12";
-}
-
-void
-AArch64InstPrinter::printBareImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
- O << MO.getImm();
-}
-
-template<unsigned RegWidth> void
-AArch64InstPrinter::printBFILSBOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &ImmROp = MI->getOperand(OpNum);
- unsigned LSB = ImmROp.getImm() == 0 ? 0 : RegWidth - ImmROp.getImm();
-
- O << '#' << LSB;
-}
-
-void AArch64InstPrinter::printBFIWidthOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &ImmSOp = MI->getOperand(OpNum);
- unsigned Width = ImmSOp.getImm() + 1;
-
- O << '#' << Width;
-}
-
-void
-AArch64InstPrinter::printBFXWidthOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &ImmSOp = MI->getOperand(OpNum);
- const MCOperand &ImmROp = MI->getOperand(OpNum - 1);
-
- unsigned ImmR = ImmROp.getImm();
- unsigned ImmS = ImmSOp.getImm();
-
- assert(ImmS >= ImmR && "Invalid ImmR, ImmS combination for bitfield extract");
-
- O << '#' << (ImmS - ImmR + 1);
-}
-
-void
-AArch64InstPrinter::printCRxOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &CRx = MI->getOperand(OpNum);
-
- O << 'c' << CRx.getImm();
-}
-
-
-void
-AArch64InstPrinter::printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &ScaleOp = MI->getOperand(OpNum);
-
- O << '#' << (64 - ScaleOp.getImm());
-}
-
-
-void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &o) {
- const MCOperand &MOImm8 = MI->getOperand(OpNum);
-
- assert(MOImm8.isImm()
- && "Immediate operand required for floating-point immediate inst");
-
- uint32_t Imm8 = MOImm8.getImm();
- uint32_t Fraction = Imm8 & 0xf;
- uint32_t Exponent = (Imm8 >> 4) & 0x7;
- uint32_t Negative = (Imm8 >> 7) & 0x1;
-
- float Val = 1.0f + Fraction / 16.0f;
-
- // That is:
- // 000 -> 2^1, 001 -> 2^2, 010 -> 2^3, 011 -> 2^4,
- // 100 -> 2^-3, 101 -> 2^-2, 110 -> 2^-1, 111 -> 2^0
- if (Exponent & 0x4) {
- Val /= 1 << (7 - Exponent);
- } else {
- Val *= 1 << (Exponent + 1);
- }
-
- Val = Negative ? -Val : Val;
-
- o << '#' << format("%.8f", Val);
-}
-
-void AArch64InstPrinter::printFPZeroOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &o) {
- o << "#0.0";
-}
-
-void
-AArch64InstPrinter::printCondCodeOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
-
- O << A64CondCodeToString(static_cast<A64CC::CondCodes>(MO.getImm()));
-}
-
-template <unsigned field_width, unsigned scale> void
-AArch64InstPrinter::printLabelOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
-
- if (!MO.isImm()) {
- printOperand(MI, OpNum, O);
- return;
- }
-
- // The immediate of LDR (lit) instructions is a signed 19-bit immediate, which
- // is multiplied by 4 (because all A64 instructions are 32-bits wide).
- uint64_t UImm = MO.getImm();
- uint64_t Sign = UImm & (1LL << (field_width - 1));
- int64_t SImm = scale * ((UImm & ~Sign) - Sign);
-
- O << "#" << SImm;
-}
-
-template<unsigned RegWidth> void
-AArch64InstPrinter::printLogicalImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
- uint64_t Val;
- A64Imms::isLogicalImmBits(RegWidth, MO.getImm(), Val);
- O << "#0x";
- O.write_hex(Val);
-}
-
-void
-AArch64InstPrinter::printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, int MemSize) {
- const MCOperand &MOImm = MI->getOperand(OpNum);
-
- if (MOImm.isImm()) {
- uint32_t Imm = MOImm.getImm() * MemSize;
-
- O << "#" << Imm;
- } else {
- O << "#" << *MOImm.getExpr();
- }
-}
-
-void
-AArch64InstPrinter::printShiftOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O,
- A64SE::ShiftExtSpecifiers Shift) {
- const MCOperand &MO = MI->getOperand(OpNum);
-
- // LSL #0 is not printed
- if (Shift == A64SE::LSL && MO.isImm() && MO.getImm() == 0)
- return;
-
- switch (Shift) {
- case A64SE::LSL: O << "lsl"; break;
- case A64SE::LSR: O << "lsr"; break;
- case A64SE::ASR: O << "asr"; break;
- case A64SE::ROR: O << "ror"; break;
- default: llvm_unreachable("Invalid shift specifier in logical instruction");
- }
-
- O << " #" << MO.getImm();
-}
-
-void
-AArch64InstPrinter::printMoveWideImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &UImm16MO = MI->getOperand(OpNum);
- const MCOperand &ShiftMO = MI->getOperand(OpNum + 1);
-
- if (UImm16MO.isImm()) {
- O << '#' << UImm16MO.getImm();
-
- if (ShiftMO.getImm() != 0)
- O << ", lsl #" << (ShiftMO.getImm() * 16);
-
- return;
- }
-
- O << "#" << *UImm16MO.getExpr();
-}
-
-void AArch64InstPrinter::printNamedImmOperand(const NamedImmMapper &Mapper,
- const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- bool ValidName;
- const MCOperand &MO = MI->getOperand(OpNum);
- StringRef Name = Mapper.toString(MO.getImm(), ValidName);
-
- if (ValidName)
- O << Name;
- else
- O << '#' << MO.getImm();
-}
-
-void
-AArch64InstPrinter::printSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
- const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
-
- bool ValidName;
- std::string Name = Mapper.toString(MO.getImm(), ValidName);
- if (ValidName) {
- O << Name;
- return;
- }
-}
-
-
-void AArch64InstPrinter::printRegExtendOperand(const MCInst *MI,
- unsigned OpNum,
- raw_ostream &O,
- A64SE::ShiftExtSpecifiers Ext) {
- // FIXME: In principle TableGen should be able to detect this itself far more
- // easily. We will only accumulate more of these hacks.
- unsigned Reg0 = MI->getOperand(0).getReg();
- unsigned Reg1 = MI->getOperand(1).getReg();
-
- if (isStackReg(Reg0) || isStackReg(Reg1)) {
- A64SE::ShiftExtSpecifiers LSLEquiv;
-
- if (Reg0 == AArch64::XSP || Reg1 == AArch64::XSP)
- LSLEquiv = A64SE::UXTX;
- else
- LSLEquiv = A64SE::UXTW;
-
- if (Ext == LSLEquiv) {
- O << "lsl #" << MI->getOperand(OpNum).getImm();
+ if (Opcode == AArch64::SYSxt)
+ if (printSysAlias(MI, O)) {
+ printAnnotation(O, Annot);
return;
}
+
+ // SBFM/UBFM should print to a nicer aliased form if possible.
+ if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri ||
+ Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) {
+ const MCOperand &Op0 = MI->getOperand(0);
+ const MCOperand &Op1 = MI->getOperand(1);
+ const MCOperand &Op2 = MI->getOperand(2);
+ const MCOperand &Op3 = MI->getOperand(3);
+
+ bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri);
+ bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri);
+ if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
+ const char *AsmMnemonic = nullptr;
+
+ switch (Op3.getImm()) {
+ default:
+ break;
+ case 7:
+ if (IsSigned)
+ AsmMnemonic = "sxtb";
+ else if (!Is64Bit)
+ AsmMnemonic = "uxtb";
+ break;
+ case 15:
+ if (IsSigned)
+ AsmMnemonic = "sxth";
+ else if (!Is64Bit)
+ AsmMnemonic = "uxth";
+ break;
+ case 31:
+ // *xtw is only valid for signed 64-bit operations.
+ if (Is64Bit && IsSigned)
+ AsmMnemonic = "sxtw";
+ break;
+ }
+
+ if (AsmMnemonic) {
+ O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+ << ", " << getRegisterName(getWRegFromXReg(Op1.getReg()));
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+
+ // All immediate shifts are aliases, implemented using the Bitfield
+ // instruction. In all cases the immediate shift amount shift must be in
+ // the range 0 to (reg.size -1).
+ if (Op2.isImm() && Op3.isImm()) {
+ const char *AsmMnemonic = nullptr;
+ int shift = 0;
+ int64_t immr = Op2.getImm();
+ int64_t imms = Op3.getImm();
+ if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
+ AsmMnemonic = "lsl";
+ shift = 31 - imms;
+ } else if (Opcode == AArch64::UBFMXri && imms != 0x3f &&
+ ((imms + 1 == immr))) {
+ AsmMnemonic = "lsl";
+ shift = 63 - imms;
+ } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) {
+ AsmMnemonic = "lsr";
+ shift = immr;
+ } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) {
+ AsmMnemonic = "lsr";
+ shift = immr;
+ } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) {
+ AsmMnemonic = "asr";
+ shift = immr;
+ } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) {
+ AsmMnemonic = "asr";
+ shift = immr;
+ }
+ if (AsmMnemonic) {
+ O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+ << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+
+ // SBFIZ/UBFIZ aliases
+ if (Op2.getImm() > Op3.getImm()) {
+ O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t'
+ << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+ << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1;
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ // Otherwise SBFX/UBFX is the preferred form
+ O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t'
+ << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+ << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1;
+ printAnnotation(O, Annot);
+ return;
}
- switch (Ext) {
- case A64SE::UXTB: O << "uxtb"; break;
- case A64SE::UXTH: O << "uxth"; break;
- case A64SE::UXTW: O << "uxtw"; break;
- case A64SE::UXTX: O << "uxtx"; break;
- case A64SE::SXTB: O << "sxtb"; break;
- case A64SE::SXTH: O << "sxth"; break;
- case A64SE::SXTW: O << "sxtw"; break;
- case A64SE::SXTX: O << "sxtx"; break;
- default: llvm_unreachable("Unexpected shift type for printing");
+ if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) {
+ const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0
+ const MCOperand &Op2 = MI->getOperand(2);
+ int ImmR = MI->getOperand(3).getImm();
+ int ImmS = MI->getOperand(4).getImm();
+
+ // BFI alias
+ if (ImmS < ImmR) {
+ int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+ int LSB = (BitWidth - ImmR) % BitWidth;
+ int Width = ImmS + 1;
+ O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", "
+ << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width;
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ int LSB = ImmR;
+ int Width = ImmS - ImmR + 1;
+ // Otherwise BFXIL the preferred form
+ O << "\tbfxil\t"
+ << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg())
+ << ", #" << LSB << ", #" << Width;
+ printAnnotation(O, Annot);
+ return;
}
- const MCOperand &MO = MI->getOperand(OpNum);
- if (MO.getImm() != 0)
- O << " #" << MO.getImm();
+ // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
+ // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
+ // printed.
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi ||
+ Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+ MI->getOperand(1).isExpr()) {
+ if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi)
+ O << "\tmovz\t";
+ else
+ O << "\tmovn\t";
+
+ O << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << *MI->getOperand(1).getExpr();
+ return;
+ }
+
+ if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) &&
+ MI->getOperand(2).isExpr()) {
+ O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << *MI->getOperand(2).getExpr();
+ return;
+ }
+
+ if (!printAliasInstr(MI, O))
+ printInstruction(MI, O);
+
+ printAnnotation(O, Annot);
}
-template<int MemScale> void
-AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MOImm = MI->getOperand(OpNum);
- int32_t Imm = unpackSignedImm(7, MOImm.getImm());
-
- O << "#" << (Imm * MemScale);
+static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
+ bool &IsTbx) {
+ switch (Opcode) {
+ case AArch64::TBXv8i8One:
+ case AArch64::TBXv8i8Two:
+ case AArch64::TBXv8i8Three:
+ case AArch64::TBXv8i8Four:
+ IsTbx = true;
+ Layout = ".8b";
+ return true;
+ case AArch64::TBLv8i8One:
+ case AArch64::TBLv8i8Two:
+ case AArch64::TBLv8i8Three:
+ case AArch64::TBLv8i8Four:
+ IsTbx = false;
+ Layout = ".8b";
+ return true;
+ case AArch64::TBXv16i8One:
+ case AArch64::TBXv16i8Two:
+ case AArch64::TBXv16i8Three:
+ case AArch64::TBXv16i8Four:
+ IsTbx = true;
+ Layout = ".16b";
+ return true;
+ case AArch64::TBLv16i8One:
+ case AArch64::TBLv16i8Two:
+ case AArch64::TBLv16i8Three:
+ case AArch64::TBLv16i8Four:
+ IsTbx = false;
+ Layout = ".16b";
+ return true;
+ default:
+ return false;
+ }
}
-void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- unsigned Reg = MI->getOperand(OpNo).getReg();
- std::string Name = getRegisterName(Reg);
- Name[0] = 'v';
- O << Name;
+struct LdStNInstrDesc {
+ unsigned Opcode;
+ const char *Mnemonic;
+ const char *Layout;
+ int ListOperand;
+ bool HasLane;
+ int NaturalOffset;
+};
+
+static LdStNInstrDesc LdStNInstInfo[] = {
+ { AArch64::LD1i8, "ld1", ".b", 1, true, 0 },
+ { AArch64::LD1i16, "ld1", ".h", 1, true, 0 },
+ { AArch64::LD1i32, "ld1", ".s", 1, true, 0 },
+ { AArch64::LD1i64, "ld1", ".d", 1, true, 0 },
+ { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 },
+ { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 },
+ { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 },
+ { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 },
+ { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 },
+ { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 },
+ { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 },
+ { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 },
+ { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 },
+ { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 },
+ { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 },
+ { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 },
+ { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 },
+ { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 },
+ { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 },
+ { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 },
+ { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 },
+ { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 },
+ { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 },
+ { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 },
+ { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 },
+ { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 },
+ { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 },
+ { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 },
+ { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 },
+ { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 },
+ { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 },
+ { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 },
+ { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 },
+ { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 },
+ { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 },
+ { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 },
+ { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 },
+ { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 },
+ { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 },
+ { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 },
+ { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 },
+ { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 },
+ { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 },
+ { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 },
+ { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 },
+ { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 },
+ { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 },
+ { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 },
+ { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 },
+ { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 },
+ { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 },
+ { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 },
+ { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 },
+ { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 },
+ { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 },
+ { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 },
+ { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 },
+ { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 },
+ { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 },
+ { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 },
+ { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 },
+ { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 },
+ { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 },
+ { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 },
+ { AArch64::LD2i8, "ld2", ".b", 1, true, 0 },
+ { AArch64::LD2i16, "ld2", ".h", 1, true, 0 },
+ { AArch64::LD2i32, "ld2", ".s", 1, true, 0 },
+ { AArch64::LD2i64, "ld2", ".d", 1, true, 0 },
+ { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 },
+ { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 },
+ { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 },
+ { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 },
+ { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 },
+ { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 },
+ { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 },
+ { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 },
+ { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 },
+ { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 },
+ { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 },
+ { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 },
+ { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 },
+ { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 },
+ { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 },
+ { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 },
+ { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 },
+ { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 },
+ { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 },
+ { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 },
+ { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 },
+ { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 },
+ { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 },
+ { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 },
+ { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 },
+ { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 },
+ { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 },
+ { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 },
+ { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 },
+ { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 },
+ { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 },
+ { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 },
+ { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 },
+ { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 },
+ { AArch64::LD3i8, "ld3", ".b", 1, true, 0 },
+ { AArch64::LD3i16, "ld3", ".h", 1, true, 0 },
+ { AArch64::LD3i32, "ld3", ".s", 1, true, 0 },
+ { AArch64::LD3i64, "ld3", ".d", 1, true, 0 },
+ { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 },
+ { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 },
+ { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
+ { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
+ { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 },
+ { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 },
+ { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 },
+ { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 },
+ { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 },
+ { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 },
+ { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 },
+ { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 },
+ { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 },
+ { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 },
+ { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 },
+ { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 },
+ { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 },
+ { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 },
+ { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 },
+ { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 },
+ { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 },
+ { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 },
+ { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 },
+ { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 },
+ { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 },
+ { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 },
+ { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 },
+ { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 },
+ { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 },
+ { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 },
+ { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 },
+ { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 },
+ { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 },
+ { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 },
+ { AArch64::LD4i8, "ld4", ".b", 1, true, 0 },
+ { AArch64::LD4i16, "ld4", ".h", 1, true, 0 },
+ { AArch64::LD4i32, "ld4", ".s", 1, true, 0 },
+ { AArch64::LD4i64, "ld4", ".d", 1, true, 0 },
+ { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 },
+ { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 },
+ { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 },
+ { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 },
+ { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 },
+ { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 },
+ { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 },
+ { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 },
+ { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 },
+ { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 },
+ { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 },
+ { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 },
+ { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 },
+ { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 },
+ { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 },
+ { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 },
+ { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 },
+ { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 },
+ { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 },
+ { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 },
+ { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 },
+ { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 },
+ { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 },
+ { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 },
+ { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 },
+ { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 },
+ { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 },
+ { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 },
+ { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 },
+ { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 },
+ { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 },
+ { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 },
+ { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 },
+ { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 },
+ { AArch64::ST1i8, "st1", ".b", 0, true, 0 },
+ { AArch64::ST1i16, "st1", ".h", 0, true, 0 },
+ { AArch64::ST1i32, "st1", ".s", 0, true, 0 },
+ { AArch64::ST1i64, "st1", ".d", 0, true, 0 },
+ { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 },
+ { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 },
+ { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 },
+ { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 },
+ { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 },
+ { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 },
+ { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 },
+ { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 },
+ { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 },
+ { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 },
+ { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 },
+ { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 },
+ { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 },
+ { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 },
+ { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 },
+ { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 },
+ { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 },
+ { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 },
+ { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 },
+ { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 },
+ { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 },
+ { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 },
+ { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 },
+ { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 },
+ { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 },
+ { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 },
+ { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 },
+ { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 },
+ { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 },
+ { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 },
+ { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 },
+ { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 },
+ { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 },
+ { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 },
+ { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 },
+ { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 },
+ { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 },
+ { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 },
+ { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 },
+ { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 },
+ { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 },
+ { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 },
+ { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 },
+ { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 },
+ { AArch64::ST2i8, "st2", ".b", 0, true, 0 },
+ { AArch64::ST2i16, "st2", ".h", 0, true, 0 },
+ { AArch64::ST2i32, "st2", ".s", 0, true, 0 },
+ { AArch64::ST2i64, "st2", ".d", 0, true, 0 },
+ { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 },
+ { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 },
+ { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 },
+ { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 },
+ { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 },
+ { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 },
+ { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 },
+ { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 },
+ { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 },
+ { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 },
+ { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 },
+ { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 },
+ { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 },
+ { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 },
+ { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 },
+ { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 },
+ { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 },
+ { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 },
+ { AArch64::ST3i8, "st3", ".b", 0, true, 0 },
+ { AArch64::ST3i16, "st3", ".h", 0, true, 0 },
+ { AArch64::ST3i32, "st3", ".s", 0, true, 0 },
+ { AArch64::ST3i64, "st3", ".d", 0, true, 0 },
+ { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 },
+ { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 },
+ { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 },
+ { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 },
+ { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 },
+ { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 },
+ { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 },
+ { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 },
+ { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 },
+ { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 },
+ { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 },
+ { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 },
+ { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 },
+ { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 },
+ { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 },
+ { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 },
+ { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 },
+ { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 },
+ { AArch64::ST4i8, "st4", ".b", 0, true, 0 },
+ { AArch64::ST4i16, "st4", ".h", 0, true, 0 },
+ { AArch64::ST4i32, "st4", ".s", 0, true, 0 },
+ { AArch64::ST4i64, "st4", ".d", 0, true, 0 },
+ { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 },
+ { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 },
+ { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 },
+ { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 },
+ { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 },
+ { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 },
+ { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 },
+ { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 },
+ { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 },
+ { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 },
+ { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 },
+ { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 },
+ { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 },
+ { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 },
+ { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 },
+ { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 },
+ { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 },
+ { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 },
+};
+
+static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+ unsigned Idx;
+ for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
+ if (LdStNInstInfo[Idx].Opcode == Opcode)
+ return &LdStNInstInfo[Idx];
+
+ return nullptr;
+}
+
+void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
+ unsigned Opcode = MI->getOpcode();
+ StringRef Layout, Mnemonic;
+
+ bool IsTbx;
+ if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) {
+ O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
+ << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", ";
+
+ unsigned ListOpNum = IsTbx ? 2 : 1;
+ printVectorList(MI, ListOpNum, O, "");
+
+ O << ", "
+ << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg);
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+ O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
+
+ // Now onto the operands: first a vector list with possible lane
+ // specifier. E.g. { v0 }[2]
+ int OpNum = LdStDesc->ListOperand;
+ printVectorList(MI, OpNum++, O, "");
+
+ if (LdStDesc->HasLane)
+ O << '[' << MI->getOperand(OpNum++).getImm() << ']';
+
+ // Next the address: [xN]
+ unsigned AddrReg = MI->getOperand(OpNum++).getReg();
+ O << ", [" << getRegisterName(AddrReg) << ']';
+
+ // Finally, there might be a post-indexed offset.
+ if (LdStDesc->NaturalOffset != 0) {
+ unsigned Reg = MI->getOperand(OpNum++).getReg();
+ if (Reg != AArch64::XZR)
+ O << ", " << getRegisterName(Reg);
+ else {
+ assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?");
+ O << ", #" << LdStDesc->NaturalOffset;
+ }
+ }
+
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ AArch64InstPrinter::printInst(MI, O, Annot);
+}
+
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
+#ifndef NDEBUG
+ unsigned Opcode = MI->getOpcode();
+ assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
+#endif
+
+ const char *Asm = nullptr;
+ const MCOperand &Op1 = MI->getOperand(0);
+ const MCOperand &Cn = MI->getOperand(1);
+ const MCOperand &Cm = MI->getOperand(2);
+ const MCOperand &Op2 = MI->getOperand(3);
+
+ unsigned Op1Val = Op1.getImm();
+ unsigned CnVal = Cn.getImm();
+ unsigned CmVal = Cm.getImm();
+ unsigned Op2Val = Op2.getImm();
+
+ if (CnVal == 7) {
+ switch (CmVal) {
+ default:
+ break;
+
+ // IC aliases
+ case 1:
+ if (Op1Val == 0 && Op2Val == 0)
+ Asm = "ic\tialluis";
+ break;
+ case 5:
+ if (Op1Val == 0 && Op2Val == 0)
+ Asm = "ic\tiallu";
+ else if (Op1Val == 3 && Op2Val == 1)
+ Asm = "ic\tivau";
+ break;
+
+ // DC aliases
+ case 4:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tzva";
+ break;
+ case 6:
+ if (Op1Val == 0 && Op2Val == 1)
+ Asm = "dc\tivac";
+ if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tisw";
+ break;
+ case 10:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcvac";
+ else if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tcsw";
+ break;
+ case 11:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcvau";
+ break;
+ case 14:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcivac";
+ else if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tcisw";
+ break;
+
+ // AT aliases
+ case 8:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e1r"; break;
+ case 1: Asm = "at\ts1e1w"; break;
+ case 2: Asm = "at\ts1e0r"; break;
+ case 3: Asm = "at\ts1e0w"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e2r"; break;
+ case 1: Asm = "at\ts1e2w"; break;
+ case 4: Asm = "at\ts12e1r"; break;
+ case 5: Asm = "at\ts12e1w"; break;
+ case 6: Asm = "at\ts12e0r"; break;
+ case 7: Asm = "at\ts12e0w"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e3r"; break;
+ case 1: Asm = "at\ts1e3w"; break;
+ }
+ break;
+ }
+ break;
+ }
+ } else if (CnVal == 8) {
+ // TLBI aliases
+ switch (CmVal) {
+ default:
+ break;
+ case 3:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\tvmalle1is"; break;
+ case 1: Asm = "tlbi\tvae1is"; break;
+ case 2: Asm = "tlbi\taside1is"; break;
+ case 3: Asm = "tlbi\tvaae1is"; break;
+ case 5: Asm = "tlbi\tvale1is"; break;
+ case 7: Asm = "tlbi\tvaale1is"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle2is"; break;
+ case 1: Asm = "tlbi\tvae2is"; break;
+ case 4: Asm = "tlbi\talle1is"; break;
+ case 5: Asm = "tlbi\tvale2is"; break;
+ case 6: Asm = "tlbi\tvmalls12e1is"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle3is"; break;
+ case 1: Asm = "tlbi\tvae3is"; break;
+ case 5: Asm = "tlbi\tvale3is"; break;
+ }
+ break;
+ }
+ break;
+ case 0:
+ switch (Op1Val) {
+ default:
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 1: Asm = "tlbi\tipas2e1is"; break;
+ case 5: Asm = "tlbi\tipas2le1is"; break;
+ }
+ break;
+ }
+ break;
+ case 4:
+ switch (Op1Val) {
+ default:
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 1: Asm = "tlbi\tipas2e1"; break;
+ case 5: Asm = "tlbi\tipas2le1"; break;
+ }
+ break;
+ }
+ break;
+ case 7:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\tvmalle1"; break;
+ case 1: Asm = "tlbi\tvae1"; break;
+ case 2: Asm = "tlbi\taside1"; break;
+ case 3: Asm = "tlbi\tvaae1"; break;
+ case 5: Asm = "tlbi\tvale1"; break;
+ case 7: Asm = "tlbi\tvaale1"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle2"; break;
+ case 1: Asm = "tlbi\tvae2"; break;
+ case 4: Asm = "tlbi\talle1"; break;
+ case 5: Asm = "tlbi\tvale2"; break;
+ case 6: Asm = "tlbi\tvmalls12e1"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle3"; break;
+ case 1: Asm = "tlbi\tvae3"; break;
+ case 5: Asm = "tlbi\tvale3"; break;
+ }
+ break;
+ }
+ break;
+ }
+ }
+
+ if (Asm) {
+ unsigned Reg = MI->getOperand(4).getReg();
+
+ O << '\t' << Asm;
+ if (StringRef(Asm).lower().find("all") == StringRef::npos)
+ O << ", " << getRegisterName(Reg);
+ }
+
+ return Asm != nullptr;
}
void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -386,154 +898,419 @@
O << '#' << Op.getImm();
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
- }
- else {
- // Otherwise, just print the expression.
- O << *Op.getExpr();
- }
+ O << *Op.getExpr();
}
}
-
-void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
- StringRef Annot) {
- if (MI->getOpcode() == AArch64::TLSDESCCALL) {
- // This is a special assembler directive which applies an
- // R_AARCH64_TLSDESC_CALL to the following (BLR) instruction. It has a fixed
- // form outside the normal TableGenerated scheme.
- O << "\t.tlsdesccall " << *MI->getOperand(0).getExpr();
- } else if (!printAliasInstr(MI, O))
- printInstruction(MI, O);
-
- printAnnotation(O, Annot);
+void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ O << format("#%#llx", Op.getImm());
}
-template <A64SE::ShiftExtSpecifiers Ext, bool isHalf>
-void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI,
- unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
-
- assert(MO.isImm() &&
- "Immediate operand required for Neon vector immediate inst.");
-
- bool IsLSL = false;
- if (Ext == A64SE::LSL)
- IsLSL = true;
- else if (Ext != A64SE::MSL)
- llvm_unreachable("Invalid shift specifier in movi instruction");
-
- int64_t Imm = MO.getImm();
-
- // MSL and LSLH accepts encoded shift amount 0 or 1.
- if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1)
- llvm_unreachable("Invalid shift amount in movi instruction");
-
- // LSH accepts encoded shift amount 0, 1, 2 or 3.
- if (IsLSL && (Imm < 0 || Imm > 3))
- llvm_unreachable("Invalid shift amount in movi instruction");
-
- // Print shift amount as multiple of 8 with MSL encoded shift amount
- // 0 and 1 printed as 8 and 16.
- if (!IsLSL)
- Imm++;
- Imm *= 8;
-
- // LSL #0 is not printed
- if (IsLSL) {
- if (Imm == 0)
- return;
- O << ", lsl";
+void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+ unsigned Imm, raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (Reg == AArch64::XZR)
+ O << "#" << Imm;
+ else
+ O << getRegisterName(Reg);
} else
- O << ", msl";
-
- O << " #" << Imm;
+ assert(0 && "unknown operand kind in printPostIncOperand64");
}
-void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum,
- raw_ostream &o) {
- o << "#0x0";
+void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isReg() && "Non-register vreg operand!");
+ unsigned Reg = Op.getReg();
+ O << getRegisterName(Reg, AArch64::vreg);
}
-void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MOUImm = MI->getOperand(OpNum);
+void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
+ O << "c" << Op.getImm();
+}
- assert(MOUImm.isImm() &&
- "Immediate operand required for Neon vector immediate inst.");
+void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ if (MO.isImm()) {
+ unsigned Val = (MO.getImm() & 0xfff);
+ assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+ unsigned Shift =
+ AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
+ O << '#' << Val;
+ if (Shift != 0)
+ printShifter(MI, OpNum + 1, O);
- unsigned Imm = MOUImm.getImm();
+ if (CommentStream)
+ *CommentStream << '=' << (Val << Shift) << '\n';
+ } else {
+ assert(MO.isExpr() && "Unexpected operand type!");
+ O << *MO.getExpr();
+ printShifter(MI, OpNum + 1, O);
+ }
+}
+void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ uint64_t Val = MI->getOperand(OpNum).getImm();
O << "#0x";
- O.write_hex(Imm);
+ O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32));
}
-void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI,
- unsigned OpNum,
+void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ uint64_t Val = MI->getOperand(OpNum).getImm();
+ O << "#0x";
+ O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64));
+}
+
+void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ // LSL #0 should not be printed.
+ if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL &&
+ AArch64_AM::getShiftValue(Val) == 0)
+ return;
+ O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val))
+ << " #" << AArch64_AM::getShiftValue(Val);
+}
+
+void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
- const MCOperand &MOUImm = MI->getOperand(OpNum);
-
- assert(MOUImm.isImm()
- && "Immediate operand required for Neon vector immediate inst.");
-
- unsigned Imm = MOUImm.getImm();
- O << Imm;
+ O << getRegisterName(MI->getOperand(OpNum).getReg());
+ printShifter(MI, OpNum + 1, O);
}
-void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
- unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MOUImm8 = MI->getOperand(OpNum);
-
- assert(MOUImm8.isImm() &&
- "Immediate operand required for Neon vector immediate bytemask inst.");
-
- uint32_t UImm8 = MOUImm8.getImm();
- uint64_t Mask = 0;
-
- // Replicates 0x00 or 0xff byte in a 64-bit vector
- for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
- if ((UImm8 >> ByteNum) & 1)
- Mask |= (uint64_t)0xff << (8 * ByteNum);
- }
-
- O << "#0x";
- O.write_hex(Mask);
+void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << getRegisterName(MI->getOperand(OpNum).getReg());
+ printArithExtend(MI, OpNum + 1, O);
}
-// If Count > 1, there are two valid kinds of vector list:
-// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-// (2) {Vn.layout - Vm.layout}
-// We choose the first kind as output.
-template <A64Layout::VectorLayout Layout, unsigned Count>
-void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors");
+void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val);
+ unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val);
- unsigned Reg = MI->getOperand(OpNum).getReg();
- std::string LayoutStr = A64VectorLayoutToString(Layout);
- O << "{";
- if (Count > 1) { // Print sub registers separately
- bool IsVec64 = (Layout < A64Layout::VL_16B);
- unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
- for (unsigned I = 0; I < Count; I++) {
- std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
- Name[0] = 'v';
- O << Name << LayoutStr;
- if (I != Count - 1)
- O << ", ";
+ // If the destination or first source register operand is [W]SP, print
+ // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+ // all.
+ if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) {
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src1 = MI->getOperand(1).getReg();
+ if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) &&
+ ExtType == AArch64_AM::UXTX) ||
+ ((Dest == AArch64::WSP || Src1 == AArch64::WSP) &&
+ ExtType == AArch64_AM::UXTW) ) {
+ if (ShiftVal != 0)
+ O << ", lsl #" << ShiftVal;
+ return;
}
- } else { // Print the register directly when NumVecs is 1.
- std::string Name = getRegisterName(Reg);
- Name[0] = 'v';
- O << Name << LayoutStr;
}
- O << "}";
+ O << ", " << AArch64_AM::getShiftExtendName(ExtType);
+ if (ShiftVal != 0)
+ O << " #" << ShiftVal;
+}
+
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O, char SrcRegKind,
+ unsigned Width) {
+ unsigned SignExtend = MI->getOperand(OpNum).getImm();
+ unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
+
+ // sxtw, sxtx, uxtw or lsl (== uxtx)
+ bool IsLSL = !SignExtend && SrcRegKind == 'x';
+ if (IsLSL)
+ O << "lsl";
+ else
+ O << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind;
+
+ if (DoShift || IsLSL)
+ O << " #" << Log2_32(Width / 8);
+}
+
+void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+ O << AArch64CC::getCondCodeName(CC);
+}
+
+void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+ O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC));
+}
+
+void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']';
+}
+
+template<int Scale>
+void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '#' << Scale * MI->getOperand(OpNum).getImm();
+}
+
+void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
+ unsigned Scale, raw_ostream &O) {
+ const MCOperand MO = MI->getOperand(OpNum);
+ if (MO.isImm()) {
+ O << "#" << (MO.getImm() * Scale);
+ } else {
+ assert(MO.isExpr() && "Unexpected operand type!");
+ O << *MO.getExpr();
+ }
+}
+
+void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+ unsigned Scale, raw_ostream &O) {
+ const MCOperand MO1 = MI->getOperand(OpNum + 1);
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
+ if (MO1.isImm()) {
+ O << ", #" << (MO1.getImm() * Scale);
+ } else {
+ assert(MO1.isExpr() && "Unexpected operand type!");
+ O << ", " << *MO1.getExpr();
+ }
+ O << ']';
+}
+
+void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned prfop = MI->getOperand(OpNum).getImm();
+ bool Valid;
+ StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid);
+ if (Valid)
+ O << Name;
+ else
+ O << '#' << prfop;
+}
+
+void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ float FPImm =
+ MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm());
+
+ // 8 decimal places are enough to perfectly represent permitted floats.
+ O << format("#%.8f", FPImm);
+}
+
+static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
+ while (Stride--) {
+ switch (Reg) {
+ default:
+ assert(0 && "Vector register expected!");
+ case AArch64::Q0: Reg = AArch64::Q1; break;
+ case AArch64::Q1: Reg = AArch64::Q2; break;
+ case AArch64::Q2: Reg = AArch64::Q3; break;
+ case AArch64::Q3: Reg = AArch64::Q4; break;
+ case AArch64::Q4: Reg = AArch64::Q5; break;
+ case AArch64::Q5: Reg = AArch64::Q6; break;
+ case AArch64::Q6: Reg = AArch64::Q7; break;
+ case AArch64::Q7: Reg = AArch64::Q8; break;
+ case AArch64::Q8: Reg = AArch64::Q9; break;
+ case AArch64::Q9: Reg = AArch64::Q10; break;
+ case AArch64::Q10: Reg = AArch64::Q11; break;
+ case AArch64::Q11: Reg = AArch64::Q12; break;
+ case AArch64::Q12: Reg = AArch64::Q13; break;
+ case AArch64::Q13: Reg = AArch64::Q14; break;
+ case AArch64::Q14: Reg = AArch64::Q15; break;
+ case AArch64::Q15: Reg = AArch64::Q16; break;
+ case AArch64::Q16: Reg = AArch64::Q17; break;
+ case AArch64::Q17: Reg = AArch64::Q18; break;
+ case AArch64::Q18: Reg = AArch64::Q19; break;
+ case AArch64::Q19: Reg = AArch64::Q20; break;
+ case AArch64::Q20: Reg = AArch64::Q21; break;
+ case AArch64::Q21: Reg = AArch64::Q22; break;
+ case AArch64::Q22: Reg = AArch64::Q23; break;
+ case AArch64::Q23: Reg = AArch64::Q24; break;
+ case AArch64::Q24: Reg = AArch64::Q25; break;
+ case AArch64::Q25: Reg = AArch64::Q26; break;
+ case AArch64::Q26: Reg = AArch64::Q27; break;
+ case AArch64::Q27: Reg = AArch64::Q28; break;
+ case AArch64::Q28: Reg = AArch64::Q29; break;
+ case AArch64::Q29: Reg = AArch64::Q30; break;
+ case AArch64::Q30: Reg = AArch64::Q31; break;
+ // Vector lists can wrap around.
+ case AArch64::Q31:
+ Reg = AArch64::Q0;
+ break;
+ }
+ }
+ return Reg;
+}
+
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O,
+ StringRef LayoutSuffix) {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+
+ O << "{ ";
+
+ // Work out how many registers there are in the list (if there is an actual
+ // list).
+ unsigned NumRegs = 1;
+ if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
+ NumRegs = 2;
+ else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
+ NumRegs = 3;
+ else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
+ NumRegs = 4;
+
+ // Now forget about the list and find out what the first register is.
+ if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0))
+ Reg = FirstReg;
+ else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
+ Reg = FirstReg;
+
+ // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+ // printing (otherwise getRegisterName fails).
+ if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+ const MCRegisterClass &FPR128RC =
+ MRI.getRegClass(AArch64::FPR128RegClassID);
+ Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+ }
+
+ for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+ O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+ if (i + 1 != NumRegs)
+ O << ", ";
+ }
+
+ O << " }";
+}
+
+void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ printVectorList(MI, OpNum, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ std::string Suffix(".");
+ if (NumLanes)
+ Suffix += itostr(NumLanes) + LaneKind;
+ else
+ Suffix += LaneKind;
+
+ printVectorList(MI, OpNum, O, Suffix);
+}
+
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+
+ // If the label has already been resolved to an immediate offset (say, when
+ // we're running the disassembler), just print the immediate.
+ if (Op.isImm()) {
+ O << "#" << (Op.getImm() << 2);
+ return;
+ }
+
+ // If the branch target is simply an address then print it in hex.
+ const MCConstantExpr *BranchTarget =
+ dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+ } else {
+ // Otherwise, just print the expression.
+ O << *MI->getOperand(OpNum).getExpr();
+ }
+}
+
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+
+ // If the label has already been resolved to an immediate offset (say, when
+ // we're running the disassembler), just print the immediate.
+ if (Op.isImm()) {
+ O << "#" << (Op.getImm() << 12);
+ return;
+ }
+
+ // Otherwise, just print the expression.
+ O << *MI->getOperand(OpNum).getExpr();
+}
+
+void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ unsigned Opcode = MI->getOpcode();
+
+ bool Valid;
+ StringRef Name;
+ if (Opcode == AArch64::ISB)
+ Name = AArch64ISB::ISBMapper().toString(Val, Valid);
+ else
+ Name = AArch64DB::DBarrierMapper().toString(Val, Valid);
+ if (Valid)
+ O << Name;
+ else
+ O << "#" << Val;
+}
+
+void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+
+ bool Valid;
+ auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures());
+ std::string Name = Mapper.toString(Val, Valid);
+
+ if (Valid)
+ O << StringRef(Name).upper();
+}
+
+void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+
+ bool Valid;
+ auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures());
+ std::string Name = Mapper.toString(Val, Valid);
+
+ if (Valid)
+ O << StringRef(Name).upper();
+}
+
+void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+
+ bool Valid;
+ StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid);
+ if (Valid)
+ O << StringRef(Name.str()).upper();
+ else
+ O << "#" << Val;
+}
+
+void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned RawVal = MI->getOperand(OpNo).getImm();
+ uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal);
+ O << format("#%#016llx", Val);
}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 37b7273..fe7666e 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -11,11 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64INSTPRINTER_H
-#define LLVM_AARCH64INSTPRINTER_H
+#ifndef AArch64INSTPRINTER_H
+#define AArch64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -28,154 +28,112 @@
AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
- // Autogenerated by tblgen
- void printInstruction(const MCInst *MI, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo);
- static const char *getInstructionName(unsigned Opcode);
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
- void printRegName(raw_ostream &O, unsigned RegNum) const;
-
- template<unsigned MemSize, unsigned RmSize>
- void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- printAddrRegExtendOperand(MI, OpNum, O, MemSize, RmSize);
+ // Autogenerated by tblgen.
+ virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+ virtual StringRef getRegName(unsigned RegNo) const {
+ return getRegisterName(RegNo);
}
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AArch64::NoRegAltName);
-
- void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, unsigned MemSize,
- unsigned RmSize);
-
- void printAddSubImmLSL0Operand(const MCInst *MI,
- unsigned OpNum, raw_ostream &O);
- void printAddSubImmLSL12Operand(const MCInst *MI,
- unsigned OpNum, raw_ostream &O);
-
- void printBareImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
- template<unsigned RegWidth>
- void printBFILSBOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printBFIWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printBFXWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-
- void printCondCodeOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- void printCRxOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- void printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o);
-
- void printFPZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o);
-
- template<int MemScale>
- void printOffsetUImm12Operand(const MCInst *MI,
- unsigned OpNum, raw_ostream &o) {
- printOffsetUImm12Operand(MI, OpNum, o, MemScale);
- }
-
- void printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum,
- raw_ostream &o, int MemScale);
-
- template<unsigned field_width, unsigned scale>
- void printLabelOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- template<unsigned RegWidth>
- void printLogicalImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
- template<typename SomeNamedImmMapper>
- void printNamedImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- printNamedImmOperand(SomeNamedImmMapper(), MI, OpNum, O);
- }
-
- void printNamedImmOperand(const NamedImmMapper &Mapper,
- const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- void printSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
- const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- void printMRSOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- printSysRegOperand(A64SysReg::MRSMapper(), MI, OpNum, O);
- }
-
- void printMSROperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- printSysRegOperand(A64SysReg::MSRMapper(), MI, OpNum, O);
- }
-
- void printShiftOperand(const char *name, const MCInst *MI,
- unsigned OpIdx, raw_ostream &O);
-
- void printLSLOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
- void printLSROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
- printShiftOperand("lsr", MI, OpNum, O);
- }
- void printASROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
- printShiftOperand("asr", MI, OpNum, O);
- }
- void printROROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
- printShiftOperand("ror", MI, OpNum, O);
- }
-
- template<A64SE::ShiftExtSpecifiers Shift>
- void printShiftOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
- printShiftOperand(MI, OpNum, O, Shift);
- }
-
- void printShiftOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, A64SE::ShiftExtSpecifiers Sh);
-
-
- void printMoveWideImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- template<int MemSize> void
- printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
- void printOffsetSImm9Operand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
-
- void printPRFMOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
- template<A64SE::ShiftExtSpecifiers EXT>
- void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- printRegExtendOperand(MI, OpNum, O, EXT);
- }
-
- void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, A64SE::ShiftExtSpecifiers Ext);
-
- void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+protected:
+ bool printSysAlias(const MCInst *MI, raw_ostream &O);
+ // Operand printers
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
-
- bool isStackReg(unsigned RegNo) {
- return RegNo == AArch64::XSP || RegNo == AArch64::WSP;
+ void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+ raw_ostream &O);
+ template<int Amount>
+ void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, Amount, O);
}
- template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
- void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
- void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
+ void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- template <A64Layout::VectorLayout Layout, unsigned Count>
- void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+ char SrcRegKind, unsigned Width);
+ template <char SrcRegKind, unsigned Width>
+ void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printMemExtend(MI, OpNum, O, SrcRegKind, Width);
+ }
+
+ void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+ raw_ostream &O);
+ void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+ raw_ostream &O);
+
+ template<int Scale>
+ void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printUImm12Offset(MI, OpNum, Scale, O);
+ }
+
+ template<int BitWidth>
+ void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
+ }
+
+ void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ template<int Scale>
+ void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+ StringRef LayoutSuffix);
+
+ /// Print a list of vector registers where the type suffix is implicit
+ /// (i.e. attached to the instruction rather than the registers).
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+
+ template <unsigned NumLanes, char LaneKind>
+ void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+};
+
+class AArch64AppleInstPrinter : public AArch64InstPrinter {
+public:
+ AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+
+ void printInstruction(const MCInst *MI, raw_ostream &O) override;
+ bool printAliasInstr(const MCInst *MI, raw_ostream &O) override;
+ virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+ StringRef getRegName(unsigned RegNo) const override {
+ return getRegisterName(RegNo);
+ }
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AArch64::NoRegAltName);
};
}
diff --git a/lib/Target/AArch64/InstPrinter/Android.mk b/lib/Target/AArch64/InstPrinter/Android.mk
index ac9b0df..de6aa89 100644
--- a/lib/Target/AArch64/InstPrinter/Android.mk
+++ b/lib/Target/AArch64/InstPrinter/Android.mk
@@ -2,6 +2,7 @@
arm64_asm_printer_TBLGEN_TABLES := \
AArch64GenAsmWriter.inc \
+ AArch64GenAsmWriter1.inc \
AArch64GenRegisterInfo.inc \
AArch64GenSubtargetInfo.inc \
AArch64GenInstrInfo.inc
diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt
index 3db56e4..363f502 100644
--- a/lib/Target/AArch64/InstPrinter/CMakeLists.txt
+++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt
@@ -1,3 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
add_llvm_library(LLVMAArch64AsmPrinter
AArch64InstPrinter.cpp
)
+
+add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt
index 4836c7c..a13e842 100644
--- a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
+;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile
index 1c36a8d..b17e8d0 100644
--- a/lib/Target/AArch64/InstPrinter/Makefile
+++ b/lib/Target/AArch64/InstPrinter/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMAArch64AsmPrinter
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' arm target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 4c8f101..642c183 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/LLVMBuild.txt -----------------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/LLVMBuild.txt -------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -31,5 +31,5 @@
type = Library
name = AArch64CodeGen
parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils AsmPrinter CodeGen Core MC SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
new file mode 100644
index 0000000..8b1e44e
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -0,0 +1,738 @@
+//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// AArch64_AM - AArch64 Addressing Mode Stuff
+namespace AArch64_AM {
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//
+
+enum ShiftExtendType {
+ InvalidShiftExtend = -1,
+ LSL = 0,
+ LSR,
+ ASR,
+ ROR,
+ MSL,
+
+ UXTB,
+ UXTH,
+ UXTW,
+ UXTX,
+
+ SXTB,
+ SXTH,
+ SXTW,
+ SXTX,
+};
+
+/// getShiftName - Get the string encoding for the shift type.
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
+ switch (ST) {
+ default: assert(false && "unhandled shift type!");
+ case AArch64_AM::LSL: return "lsl";
+ case AArch64_AM::LSR: return "lsr";
+ case AArch64_AM::ASR: return "asr";
+ case AArch64_AM::ROR: return "ror";
+ case AArch64_AM::MSL: return "msl";
+ case AArch64_AM::UXTB: return "uxtb";
+ case AArch64_AM::UXTH: return "uxth";
+ case AArch64_AM::UXTW: return "uxtw";
+ case AArch64_AM::UXTX: return "uxtx";
+ case AArch64_AM::SXTB: return "sxtb";
+ case AArch64_AM::SXTH: return "sxth";
+ case AArch64_AM::SXTW: return "sxtw";
+ case AArch64_AM::SXTX: return "sxtx";
+ }
+ return nullptr;
+}
+
+/// getShiftType - Extract the shift type.
+static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) {
+ switch ((Imm >> 6) & 0x7) {
+ default: return AArch64_AM::InvalidShiftExtend;
+ case 0: return AArch64_AM::LSL;
+ case 1: return AArch64_AM::LSR;
+ case 2: return AArch64_AM::ASR;
+ case 3: return AArch64_AM::ROR;
+ case 4: return AArch64_AM::MSL;
+ }
+}
+
+/// getShiftValue - Extract the shift value.
+static inline unsigned getShiftValue(unsigned Imm) {
+ return Imm & 0x3f;
+}
+
+/// getShifterImm - Encode the shift type and amount:
+/// imm: 6-bit shift amount
+/// shifter: 000 ==> lsl
+/// 001 ==> lsr
+/// 010 ==> asr
+/// 011 ==> ror
+/// 100 ==> msl
+/// {8-6} = shifter
+/// {5-0} = imm
+static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
+ unsigned Imm) {
+ assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
+ unsigned STEnc = 0;
+ switch (ST) {
+ default: llvm_unreachable("Invalid shift requested");
+ case AArch64_AM::LSL: STEnc = 0; break;
+ case AArch64_AM::LSR: STEnc = 1; break;
+ case AArch64_AM::ASR: STEnc = 2; break;
+ case AArch64_AM::ROR: STEnc = 3; break;
+ case AArch64_AM::MSL: STEnc = 4; break;
+ }
+ return (STEnc << 6) | (Imm & 0x3f);
+}
+
+//===----------------------------------------------------------------------===//
+// Extends
+//
+
+/// getArithShiftValue - get the arithmetic shift value.
+static inline unsigned getArithShiftValue(unsigned Imm) {
+ return Imm & 0x7;
+}
+
+/// getExtendType - Extract the extend type for operands of arithmetic ops.
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) {
+ assert((Imm & 0x7) == Imm && "invalid immediate!");
+ switch (Imm) {
+ default: llvm_unreachable("Compiler bug!");
+ case 0: return AArch64_AM::UXTB;
+ case 1: return AArch64_AM::UXTH;
+ case 2: return AArch64_AM::UXTW;
+ case 3: return AArch64_AM::UXTX;
+ case 4: return AArch64_AM::SXTB;
+ case 5: return AArch64_AM::SXTH;
+ case 6: return AArch64_AM::SXTW;
+ case 7: return AArch64_AM::SXTX;
+ }
+}
+
+static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) {
+ return getExtendType((Imm >> 3) & 0x7);
+}
+
+/// Mapping from extend bits to required operation:
+/// shifter: 000 ==> uxtb
+/// 001 ==> uxth
+/// 010 ==> uxtw
+/// 011 ==> uxtx
+/// 100 ==> sxtb
+/// 101 ==> sxth
+/// 110 ==> sxtw
+/// 111 ==> sxtx
+inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) {
+ switch (ET) {
+ default: llvm_unreachable("Invalid extend type requested");
+ case AArch64_AM::UXTB: return 0; break;
+ case AArch64_AM::UXTH: return 1; break;
+ case AArch64_AM::UXTW: return 2; break;
+ case AArch64_AM::UXTX: return 3; break;
+ case AArch64_AM::SXTB: return 4; break;
+ case AArch64_AM::SXTH: return 5; break;
+ case AArch64_AM::SXTW: return 6; break;
+ case AArch64_AM::SXTX: return 7; break;
+ }
+}
+
+/// getArithExtendImm - Encode the extend type and shift amount for an
+/// arithmetic instruction:
+/// imm: 3-bit extend amount
+/// {5-3} = shifter
+/// {2-0} = imm3
+static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET,
+ unsigned Imm) {
+ assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
+ return (getExtendEncoding(ET) << 3) | (Imm & 0x7);
+}
+
+/// getMemDoShift - Extract the "do shift" flag value for load/store
+/// instructions.
+static inline bool getMemDoShift(unsigned Imm) {
+ return (Imm & 0x1) != 0;
+}
+
+/// getExtendType - Extract the extend type for the offset operand of
+/// loads/stores.
+static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) {
+ return getExtendType((Imm >> 1) & 0x7);
+}
+
+/// getExtendImm - Encode the extend type and amount for a load/store inst:
+/// doshift: should the offset be scaled by the access size
+/// shifter: 000 ==> uxtb
+/// 001 ==> uxth
+/// 010 ==> uxtw
+/// 011 ==> uxtx
+/// 100 ==> sxtb
+/// 101 ==> sxth
+/// 110 ==> sxtw
+/// 111 ==> sxtx
+/// {3-1} = shifter
+/// {0} = doshift
+static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET,
+ bool DoShift) {
+ return (getExtendEncoding(ET) << 1) | unsigned(DoShift);
+}
+
+static inline uint64_t ror(uint64_t elt, unsigned size) {
+ return ((elt & 1) << (size-1)) | (elt >> 1);
+}
+
+/// processLogicalImmediate - Determine if an immediate value can be encoded
+/// as the immediate operand of a logical instruction for the given register
+/// size. If so, return true with "encoding" set to the encoded value in
+/// the form N:immr:imms.
+static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
+ uint64_t &encoding) {
+ if (imm == 0ULL || imm == ~0ULL ||
+ (regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
+ return false;
+
+ unsigned size = 2;
+ uint64_t eltVal = imm;
+
+ // First, determine the element size.
+ while (size < regSize) {
+ unsigned numElts = regSize / size;
+ unsigned mask = (1ULL << size) - 1;
+ uint64_t lowestEltVal = imm & mask;
+
+ bool allMatched = true;
+ for (unsigned i = 1; i < numElts; ++i) {
+ uint64_t currEltVal = (imm >> (i*size)) & mask;
+ if (currEltVal != lowestEltVal) {
+ allMatched = false;
+ break;
+ }
+ }
+
+ if (allMatched) {
+ eltVal = lowestEltVal;
+ break;
+ }
+
+ size *= 2;
+ }
+
+ // Second, determine the rotation to make the element be: 0^m 1^n.
+ for (unsigned i = 0; i < size; ++i) {
+ eltVal = ror(eltVal, size);
+ uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
+ uint32_t cto = CountTrailingOnes_64(eltVal);
+
+ if (clz + cto == size) {
+ // Encode in immr the number of RORs it would take to get *from* this
+ // element value to our target value, where i+1 is the number of RORs
+ // to go the opposite direction.
+ unsigned immr = size - (i + 1);
+
+ // If size has a 1 in the n'th bit, create a value that has zeroes in
+ // bits [0, n] and ones above that.
+ uint64_t nimms = ~(size-1) << 1;
+
+ // Or the CTO value into the low bits, which must be below the Nth bit
+ // bit mentioned above.
+ nimms |= (cto-1);
+
+ // Extract the seventh bit and toggle it to create the N field.
+ unsigned N = ((nimms >> 6) & 1) ^ 1;
+
+ encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// isLogicalImmediate - Return true if the immediate is valid for a logical
+/// immediate instruction of the given register size. Return false otherwise.
+static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) {
+ uint64_t encoding;
+ return processLogicalImmediate(imm, regSize, encoding);
+}
+
+/// encodeLogicalImmediate - Return the encoded immediate value for a logical
+/// immediate instruction of the given register size.
+static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) {
+ uint64_t encoding = 0;
+ bool res = processLogicalImmediate(imm, regSize, encoding);
+ assert(res && "invalid logical immediate");
+ (void)res;
+ return encoding;
+}
+
+/// decodeLogicalImmediate - Decode a logical immediate value in the form
+/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the
+/// integer value it represents with regSize bits.
+static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) {
+ // Extract the N, imms, and immr fields.
+ unsigned N = (val >> 12) & 1;
+ unsigned immr = (val >> 6) & 0x3f;
+ unsigned imms = val & 0x3f;
+
+ assert((regSize == 64 || N == 0) && "undefined logical immediate encoding");
+ int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+ assert(len >= 0 && "undefined logical immediate encoding");
+ unsigned size = (1 << len);
+ unsigned R = immr & (size - 1);
+ unsigned S = imms & (size - 1);
+ assert(S != size - 1 && "undefined logical immediate encoding");
+ uint64_t pattern = (1ULL << (S + 1)) - 1;
+ for (unsigned i = 0; i < R; ++i)
+ pattern = ror(pattern, size);
+
+ // Replicate the pattern to fill the regSize.
+ while (size != regSize) {
+ pattern |= (pattern << size);
+ size *= 2;
+ }
+ return pattern;
+}
+
+/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value
+/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits)
+/// is a valid encoding for an integer value with regSize bits.
+static inline bool isValidDecodeLogicalImmediate(uint64_t val,
+ unsigned regSize) {
+ // Extract the N and imms fields needed for checking.
+ unsigned N = (val >> 12) & 1;
+ unsigned imms = val & 0x3f;
+
+ if (regSize == 32 && N != 0) // undefined logical immediate encoding
+ return false;
+ int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+ if (len < 0) // undefined logical immediate encoding
+ return false;
+ unsigned size = (1 << len);
+ unsigned S = imms & (size - 1);
+ if (S == size - 1) // undefined logical immediate encoding
+ return false;
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point Immediates
+//
+static inline float getFPImmFloat(unsigned Imm) {
+ // We expect an 8-bit binary encoding of a floating-point number here.
+ union {
+ uint32_t I;
+ float F;
+ } FPUnion;
+
+ uint8_t Sign = (Imm >> 7) & 0x1;
+ uint8_t Exp = (Imm >> 4) & 0x7;
+ uint8_t Mantissa = Imm & 0xf;
+
+ // 8-bit FP iEEEE Float Encoding
+ // abcd efgh aBbbbbbc defgh000 00000000 00000000
+ //
+ // where B = NOT(b);
+
+ FPUnion.I = 0;
+ FPUnion.I |= Sign << 31;
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+ FPUnion.I |= (Exp & 0x3) << 23;
+ FPUnion.I |= Mantissa << 19;
+ return FPUnion.F;
+}
+
+/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP32Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
+ int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x7ffff)
+ return -1;
+ Mantissa >>= 19;
+ if ((Mantissa & 0xf) != Mantissa)
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP32Imm(const APFloat &FPImm) {
+ return getFP32Imm(FPImm.bitcastToAPInt());
+}
+
+/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP64Imm(const APInt &Imm) {
+ uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+ int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+ uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0xffffffffffffULL)
+ return -1;
+ Mantissa >>= 48;
+ if ((Mantissa & 0xf) != Mantissa)
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP64Imm(const APFloat &FPImm) {
+ return getFP64Imm(FPImm.bitcastToAPInt());
+}
+
+//===--------------------------------------------------------------------===//
+// AdvSIMD Modified Immediates
+//===--------------------------------------------------------------------===//
+
+// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType1(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xffffff00ffffff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) {
+ return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 32) | EncVal;
+}
+
+// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType2(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xffff00ffffff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) {
+ return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 40) | (EncVal << 8);
+}
+
+// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00
+static inline bool isAdvSIMDModImmType3(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xff00ffffff00ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) {
+ return (Imm & 0xff0000ULL) >> 16;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 48) | (EncVal << 16);
+}
+
+// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType4(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0x00ffffff00ffffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) {
+ return (Imm & 0xff000000ULL) >> 24;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 56) | (EncVal << 24);
+}
+
+// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType5(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) &&
+ ((Imm & 0xff00ff00ff00ff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) {
+ return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal;
+}
+
+// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType6(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) &&
+ ((Imm & 0x00ff00ff00ff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) {
+ return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8);
+}
+
+// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF
+static inline bool isAdvSIMDModImmType7(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) {
+ return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL;
+}
+
+// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF
+static inline bool isAdvSIMDModImmType8(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL;
+}
+
+static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) {
+ return (Imm & 0x00ff0000ULL) >> 16;
+}
+
+// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
+static inline bool isAdvSIMDModImmType9(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm >> 48) == (Imm & 0x0000ffffULL)) &&
+ ((Imm >> 56) == (Imm & 0x000000ffULL));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) {
+ return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ EncVal |= (EncVal << 8);
+ EncVal |= (EncVal << 16);
+ EncVal |= (EncVal << 32);
+ return EncVal;
+}
+
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// cmode: 1110, op: 1
+static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
+ uint64_t ByteA = Imm & 0xff00000000000000ULL;
+ uint64_t ByteB = Imm & 0x00ff000000000000ULL;
+ uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
+ uint64_t ByteD = Imm & 0x000000ff00000000ULL;
+ uint64_t ByteE = Imm & 0x00000000ff000000ULL;
+ uint64_t ByteF = Imm & 0x0000000000ff0000ULL;
+ uint64_t ByteG = Imm & 0x000000000000ff00ULL;
+ uint64_t ByteH = Imm & 0x00000000000000ffULL;
+
+ return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) &&
+ (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) &&
+ (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) &&
+ (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) &&
+ (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) &&
+ (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
+ (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
+ (ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) {
+ uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0;
+ uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0;
+ uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0;
+ uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0;
+ uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0;
+ uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0;
+
+ uint8_t EncVal = BitA;
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) {
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0xff00000000000000ULL;
+ if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL;
+ if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL;
+ if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL;
+ if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL;
+ if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL;
+ if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL;
+ if (Imm & 0x01) EncVal |= 0x00000000000000ffULL;
+ return EncVal;
+}
+
+// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00
+static inline bool isAdvSIMDModImmType11(uint64_t Imm) {
+ uint64_t BString = (Imm & 0x7E000000ULL) >> 25;
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ (BString == 0x1f || BString == 0x20) &&
+ ((Imm & 0x0007ffff0007ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) {
+ uint8_t BitA = (Imm & 0x80000000ULL) != 0;
+ uint8_t BitB = (Imm & 0x20000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x01000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x00800000ULL) != 0;
+ uint8_t BitE = (Imm & 0x00400000ULL) != 0;
+ uint8_t BitF = (Imm & 0x00200000ULL) != 0;
+ uint8_t BitG = (Imm & 0x00100000ULL) != 0;
+ uint8_t BitH = (Imm & 0x00080000ULL) != 0;
+
+ uint8_t EncVal = BitA;
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) {
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0x80000000ULL;
+ if (Imm & 0x40) EncVal |= 0x3e000000ULL;
+ else EncVal |= 0x40000000ULL;
+ if (Imm & 0x20) EncVal |= 0x01000000ULL;
+ if (Imm & 0x10) EncVal |= 0x00800000ULL;
+ if (Imm & 0x08) EncVal |= 0x00400000ULL;
+ if (Imm & 0x04) EncVal |= 0x00200000ULL;
+ if (Imm & 0x02) EncVal |= 0x00100000ULL;
+ if (Imm & 0x01) EncVal |= 0x00080000ULL;
+ return (EncVal << 32) | EncVal;
+}
+
+// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType12(uint64_t Imm) {
+ uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54;
+ return ((BString == 0xff || BString == 0x100) &&
+ ((Imm & 0x0000ffffffffffffULL) == 0));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) {
+ uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0;
+ uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0;
+ uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0;
+ uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0;
+ uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0;
+ uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0;
+
+ uint8_t EncVal = BitA;
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0x8000000000000000ULL;
+ if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL;
+ else EncVal |= 0x4000000000000000ULL;
+ if (Imm & 0x20) EncVal |= 0x0020000000000000ULL;
+ if (Imm & 0x10) EncVal |= 0x0010000000000000ULL;
+ if (Imm & 0x08) EncVal |= 0x0008000000000000ULL;
+ if (Imm & 0x04) EncVal |= 0x0004000000000000ULL;
+ if (Imm & 0x02) EncVal |= 0x0002000000000000ULL;
+ if (Imm & 0x01) EncVal |= 0x0001000000000000ULL;
+ return (EncVal << 32) | EncVal;
+}
+
+} // end namespace AArch64_AM
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index f1452ab..d8900d4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -6,54 +6,512 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file contains the AArch64 implementation of the MCAsmBackend class,
-// which is principally concerned with relaxation of the various fixup kinds.
-//
-//===----------------------------------------------------------------------===//
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
#include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MachO.h"
using namespace llvm;
namespace {
+
class AArch64AsmBackend : public MCAsmBackend {
- const MCSubtargetInfo* STI;
+ static const unsigned PCRelFlagVal =
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+
public:
- AArch64AsmBackend(const Target &T, const StringRef TT)
- : MCAsmBackend(),
- STI(AArch64_MC::createAArch64MCSubtargetInfo(TT, "", ""))
- {}
+ AArch64AsmBackend(const Target &T) : MCAsmBackend() {}
-
- ~AArch64AsmBackend() {
- delete STI;
+ unsigned getNumFixupKinds() const override {
+ return AArch64::NumTargetFixupKinds;
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+ const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
+ // This table *must* be in the order that the fixup_* kinds are defined in
+ // AArch64FixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
+ { "fixup_aarch64_add_imm12", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
+ { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
+ { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
+ { "fixup_aarch64_movw", 5, 16, 0 },
+ { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
+ { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
+ { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
+ };
- virtual void processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved);
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+ void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
+
+ unsigned getPointerSize() const { return 8; }
};
+
} // end anonymous namespace
-void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target,
- uint64_t &Value, bool &IsResolved) {
+/// \brief The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ assert(0 && "Unknown fixup kind!");
+
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ return 0;
+
+ case FK_Data_1:
+ return 1;
+
+ case FK_Data_2:
+ case AArch64::fixup_aarch64_movw:
+ return 2;
+
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ return 3;
+
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ case FK_Data_4:
+ return 4;
+
+ case FK_Data_8:
+ return 8;
+ }
+}
+
+static unsigned AdrImmBits(unsigned Value) {
+ unsigned lo2 = Value & 0x3;
+ unsigned hi19 = (Value & 0x1ffffc) >> 2;
+ return (hi19 << 5) | (lo2 << 29);
+}
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+ int64_t SignedValue = static_cast<int64_t>(Value);
+ switch (Kind) {
+ default:
+ assert(false && "Unknown fixup kind!");
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ report_fatal_error("fixup value out of range");
+ return AdrImmBits(Value & 0x1fffffULL);
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ // Signed 21-bit immediate
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ report_fatal_error("fixup value out of range");
+ // Low two bits are not encoded.
+ return (Value >> 2) & 0x7ffff;
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ // Unsigned 12-bit immediate
+ if (Value >= 0x1000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value;
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ // Unsigned 12-bit immediate which gets multiplied by 2
+ if (Value & 1 || Value >= 0x2000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 1;
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ // Unsigned 12-bit immediate which gets multiplied by 4
+ if (Value & 3 || Value >= 0x4000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 2;
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ // Unsigned 12-bit immediate which gets multiplied by 8
+ if (Value & 7 || Value >= 0x8000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 3;
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ // Unsigned 12-bit immediate which gets multiplied by 16
+ if (Value & 15 || Value >= 0x10000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 4;
+ case AArch64::fixup_aarch64_movw:
+ report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+ return Value;
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ // Signed 16-bit immediate
+ if (SignedValue > 32767 || SignedValue < -32768)
+ report_fatal_error("fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Value & 0x3)
+ report_fatal_error("fixup not sufficiently aligned");
+ return (Value >> 2) & 0x3fff;
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ // Signed 28-bit immediate
+ if (SignedValue > 134217727 || SignedValue < -134217728)
+ report_fatal_error("fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Value & 0x3)
+ report_fatal_error("fixup not sufficiently aligned");
+ return (Value >> 2) & 0x3ffffff;
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value;
+ }
+}
+
+void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ if (!Value)
+ return; // Doesn't change encoding.
+ MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ // Apply any target-specific value adjustments.
+ Value = adjustFixupValue(Fixup.getKind(), Value);
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+ return false;
+}
+
+bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // FIXME: This isn't correct for AArch64. Just moving the "generic" logic
+ // into the targets for now.
+ //
+ // Relax if the value is too big for a (signed) i8.
+ return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+ MCInst &Res) const {
+ assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented");
+}
+
+bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // If the count is not 4-byte aligned, we must be writing data into the text
+ // section (otherwise we have unaligned instructions, and thus have far
+ // bigger problems), so just write zeros instead.
+ if ((Count & 3) != 0) {
+ for (uint64_t i = 0, e = (Count & 3); i != e; ++i)
+ OW->Write8(0);
+ }
+
+ // We are properly aligned, so write NOPs as requested.
+ Count /= 4;
+ for (uint64_t i = 0; i != Count; ++i)
+ OW->Write32(0xd503201f);
+ return true;
+}
+
+namespace {
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+ /// \brief A "frameless" leaf function, where no non-volatile registers are
+ /// saved. The return remains in LR throughout the function.
+ UNWIND_AArch64_MODE_FRAMELESS = 0x02000000,
+
+ /// \brief No compact unwind encoding available. Instead the low 23-bits of
+ /// the compact unwind encoding is the offset of the DWARF FDE in the
+ /// __eh_frame section. This mode is never used in object files. It is only
+ /// generated by the linker in final linked images, which have only DWARF info
+ /// for a function.
+ UNWIND_AArch64_MODE_DWARF = 0x03000000,
+
+ /// \brief This is a standard arm64 prologue where FP/LR are immediately
+ /// pushed on the stack, then SP is copied to FP. If there are any
+ /// non-volatile register saved, they are copied into the stack fame in pairs
+ /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the
+ /// five X pairs and four D pairs can be saved, but the memory layout must be
+ /// in register number order.
+ UNWIND_AArch64_MODE_FRAME = 0x04000000,
+
+ /// \brief Frame register pair encodings.
+ UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001,
+ UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002,
+ UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004,
+ UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008,
+ UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010,
+ UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100,
+ UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200,
+ UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400,
+ UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800
+};
+
+} // end CU namespace
+
+// FIXME: This should be in a separate file.
+class DarwinAArch64AsmBackend : public AArch64AsmBackend {
+ const MCRegisterInfo &MRI;
+
+ /// \brief Encode compact unwind stack adjustment for frameless functions.
+ /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+ /// The stack size always needs to be 16 byte aligned.
+ uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+ return (StackSize / 16) << 12;
+ }
+
+public:
+ DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+ : AArch64AsmBackend(T), MRI(MRI) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+ return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+ MachO::CPU_SUBTYPE_ARM64_ALL);
+ }
+
+ bool doesSectionRequireSymbols(const MCSection &Section) const override {
+ // Any section for which the linker breaks things into atoms needs to
+ // preserve symbols, including assembler local symbols, to identify
+ // those atoms. These sections are:
+ // Sections of type:
+ //
+ // S_CSTRING_LITERALS (e.g. __cstring)
+ // S_LITERAL_POINTERS (e.g. objc selector pointers)
+ // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
+ //
+ // Sections named:
+ //
+ // __TEXT,__eh_frame
+ // __TEXT,__ustring
+ // __DATA,__cfstring
+ // __DATA,__objc_classrefs
+ // __DATA,__objc_catlist
+ //
+ // FIXME: It would be better if the compiler used actual linker local
+ // symbols for each of these sections rather than preserving what
+ // are ostensibly assembler local symbols.
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
+ return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
+ SMO.getType() == MachO::S_4BYTE_LITERALS ||
+ SMO.getType() == MachO::S_8BYTE_LITERALS ||
+ SMO.getType() == MachO::S_16BYTE_LITERALS ||
+ SMO.getType() == MachO::S_LITERAL_POINTERS ||
+ (SMO.getSegmentName() == "__TEXT" &&
+ (SMO.getSectionName() == "__eh_frame" ||
+ SMO.getSectionName() == "__ustring")) ||
+ (SMO.getSegmentName() == "__DATA" &&
+ (SMO.getSectionName() == "__cfstring" ||
+ SMO.getSectionName() == "__objc_classrefs" ||
+ SMO.getSectionName() == "__objc_catlist")));
+ }
+
+ /// \brief Generate the compact unwind encoding from the CFI directives.
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
+ if (Instrs.empty())
+ return CU::UNWIND_AArch64_MODE_FRAMELESS;
+
+ bool HasFP = false;
+ unsigned StackSize = 0;
+
+ uint32_t CompactUnwindEncoding = 0;
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Cannot handle this directive: bail out.
+ return CU::UNWIND_AArch64_MODE_DWARF;
+ case MCCFIInstruction::OpDefCfa: {
+ // Defines a frame pointer.
+ assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
+ AArch64::FP &&
+ "Invalid frame pointer!");
+ assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+
+ const MCCFIInstruction &LRPush = Instrs[++i];
+ assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
+ "Link register not pushed!");
+ const MCCFIInstruction &FPPush = Instrs[++i];
+ assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
+ "Frame pointer not pushed!");
+
+ unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
+ unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+
+ LRReg = getXRegFromWReg(LRReg);
+ FPReg = getXRegFromWReg(FPReg);
+
+ assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
+ "Pushing invalid registers for frame!");
+
+ // Indicate that the function has a frame.
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME;
+ HasFP = true;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ assert(StackSize == 0 && "We already have the CFA offset!");
+ StackSize = std::abs(Inst.getOffset());
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Registers are saved in pairs. We expect there to be two consecutive
+ // `.cfi_offset' instructions with the appropriate registers specified.
+ unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ if (i + 1 == e)
+ return CU::UNWIND_AArch64_MODE_DWARF;
+
+ const MCCFIInstruction &Inst2 = Instrs[++i];
+ if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
+ return CU::UNWIND_AArch64_MODE_DWARF;
+ unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+
+ // N.B. The encodings must be in register number order, and the X
+ // registers before the D registers.
+
+ // X19/X20 pair = 0x00000001,
+ // X21/X22 pair = 0x00000002,
+ // X23/X24 pair = 0x00000004,
+ // X25/X26 pair = 0x00000008,
+ // X27/X28 pair = 0x00000010
+ Reg1 = getXRegFromWReg(Reg1);
+ Reg2 = getXRegFromWReg(Reg2);
+
+ if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
+ (CompactUnwindEncoding & 0xF1E) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR;
+ else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
+ (CompactUnwindEncoding & 0xF1C) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR;
+ else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
+ (CompactUnwindEncoding & 0xF18) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR;
+ else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
+ (CompactUnwindEncoding & 0xF10) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR;
+ else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
+ (CompactUnwindEncoding & 0xF00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR;
+ else {
+ Reg1 = getDRegFromBReg(Reg1);
+ Reg2 = getDRegFromBReg(Reg2);
+
+ // D8/D9 pair = 0x00000100,
+ // D10/D11 pair = 0x00000200,
+ // D12/D13 pair = 0x00000400,
+ // D14/D15 pair = 0x00000800
+ if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
+ (CompactUnwindEncoding & 0xE00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR;
+ else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
+ (CompactUnwindEncoding & 0xC00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR;
+ else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
+ (CompactUnwindEncoding & 0x800) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR;
+ else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15)
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR;
+ else
+ // A pair was pushed which we cannot handle.
+ return CU::UNWIND_AArch64_MODE_DWARF;
+ }
+
+ break;
+ }
+ }
+ }
+
+ if (!HasFP) {
+ // With compact unwind info we can only represent stack adjustments of up
+ // to 65520 bytes.
+ if (StackSize > 65520)
+ return CU::UNWIND_AArch64_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS;
+ CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
+ }
+
+ return CompactUnwindEncoding;
+ }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+class ELFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+ uint8_t OSABI;
+ bool IsLittleEndian;
+
+ ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian)
+ : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+ return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian);
+ }
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+};
+
+void ELFAArch64AsmBackend::processFixupValue(
+ const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
+ const MCFragment *DF, const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) {
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
@@ -66,528 +524,43 @@
// same page as the ADRP and the instruction should encode 0x0. Assuming the
// section isn't 0x1000-aligned, we therefore need to delegate this decision
// to the linker -- a relocation!
- if ((uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_page ||
- (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_got_page ||
- (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_gottprel_page ||
- (uint32_t)Fixup.getKind() == AArch64::fixup_a64_tlsdesc_adr_page)
+ if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
IsResolved = false;
}
-
-static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value);
-
-namespace {
-
-class ELFAArch64AsmBackend : public AArch64AsmBackend {
- uint8_t OSABI;
- bool IsLittle; // Big or little endian
-public:
- ELFAArch64AsmBackend(const Target &T, const StringRef TT,
- uint8_t _OSABI, bool isLittle)
- : AArch64AsmBackend(T, TT), OSABI(_OSABI), IsLittle(isLittle) { }
-
- bool fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const;
-
- unsigned int getNumFixupKinds() const {
- return AArch64::NumTargetFixupKinds;
+void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ // store fixups in .eh_frame section in big endian order
+ if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
+ const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
+ const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
+ if (SecELF->getSectionName() == ".eh_frame")
+ Value = ByteSwap_32(unsigned(Value));
}
-
- const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
- const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
-// This table *must* be in the order that the fixup_* kinds are defined in
-// AArch64FixupKinds.h.
-//
-// Name Offset (bits) Size (bits) Flags
-{ "fixup_a64_ld_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_adr_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_adr_prel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_add_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst8_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst16_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst32_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst64_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst128_lo12", 0, 32, 0 },
-{ "fixup_a64_tstbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_condbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_uncondbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_call", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_movw_uabs_g0", 0, 32, 0 },
-{ "fixup_a64_movw_uabs_g0_nc", 0, 32, 0 },
-{ "fixup_a64_movw_uabs_g1", 0, 32, 0 },
-{ "fixup_a64_movw_uabs_g1_nc", 0, 32, 0 },
-{ "fixup_a64_movw_uabs_g2", 0, 32, 0 },
-{ "fixup_a64_movw_uabs_g2_nc", 0, 32, 0 },
-{ "fixup_a64_movw_uabs_g3", 0, 32, 0 },
-{ "fixup_a64_movw_sabs_g0", 0, 32, 0 },
-{ "fixup_a64_movw_sabs_g1", 0, 32, 0 },
-{ "fixup_a64_movw_sabs_g2", 0, 32, 0 },
-{ "fixup_a64_adr_prel_got_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_ld64_got_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_movw_dtprel_g2", 0, 32, 0 },
-{ "fixup_a64_movw_dtprel_g1", 0, 32, 0 },
-{ "fixup_a64_movw_dtprel_g1_nc", 0, 32, 0 },
-{ "fixup_a64_movw_dtprel_g0", 0, 32, 0 },
-{ "fixup_a64_movw_dtprel_g0_nc", 0, 32, 0 },
-{ "fixup_a64_add_dtprel_hi12", 0, 32, 0 },
-{ "fixup_a64_add_dtprel_lo12", 0, 32, 0 },
-{ "fixup_a64_add_dtprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst8_dtprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst8_dtprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst16_dtprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst16_dtprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst32_dtprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst32_dtprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst64_dtprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst64_dtprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_movw_gottprel_g1", 0, 32, 0 },
-{ "fixup_a64_movw_gottprel_g0_nc", 0, 32, 0 },
-{ "fixup_a64_adr_gottprel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_ld64_gottprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ld_gottprel_prel19", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_movw_tprel_g2", 0, 32, 0 },
-{ "fixup_a64_movw_tprel_g1", 0, 32, 0 },
-{ "fixup_a64_movw_tprel_g1_nc", 0, 32, 0 },
-{ "fixup_a64_movw_tprel_g0", 0, 32, 0 },
-{ "fixup_a64_movw_tprel_g0_nc", 0, 32, 0 },
-{ "fixup_a64_add_tprel_hi12", 0, 32, 0 },
-{ "fixup_a64_add_tprel_lo12", 0, 32, 0 },
-{ "fixup_a64_add_tprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst8_tprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst8_tprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst16_tprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst16_tprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst32_tprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst32_tprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_ldst64_tprel_lo12", 0, 32, 0 },
-{ "fixup_a64_ldst64_tprel_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_tlsdesc_adr_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_tlsdesc_ld64_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_tlsdesc_add_lo12_nc", 0, 32, 0 },
-{ "fixup_a64_tlsdesc_call", 0, 0, 0 }
- };
- if (Kind < FirstTargetFixupKind)
- return MCAsmBackend::getFixupKindInfo(Kind);
-
- assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
- "Invalid kind!");
- return Infos[Kind - FirstTargetFixupKind];
- }
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const {
- unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8;
- Value = adjustFixupValue(Fixup.getKind(), Value);
- if (!Value) return; // Doesn't change encoding.
-
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
-
- // For each byte of the fragment that the fixup touches, mask in the bits
- // from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
- }
- }
-
- bool mayNeedRelaxation(const MCInst&) const {
- return false;
- }
-
- void relaxInstruction(const MCInst&, llvm::MCInst&) const {
- llvm_unreachable("Cannot relax instructions");
- }
-
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createAArch64ELFObjectWriter(OS, OSABI, IsLittle);
- }
-};
-
-} // end anonymous namespace
-
-bool
-ELFAArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const {
- // Correct for now. With all instructions 32-bit only very low-level
- // considerations could make you select something which may fail.
- return false;
+ AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
+}
}
-
-bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- // Can't emit NOP with size not multiple of 32-bits
- if (Count % 4 != 0)
- return false;
-
- uint64_t NumNops = Count / 4;
- for (uint64_t i = 0; i != NumNops; ++i)
- OW->Write32(0xd503201f);
-
- return true;
-}
-
-static unsigned ADRImmBits(unsigned Value) {
- unsigned lo2 = Value & 0x3;
- unsigned hi19 = (Value & 0x1fffff) >> 2;
-
- return (hi19 << 5) | (lo2 << 29);
-}
-
-static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
- switch (Kind) {
- default:
- llvm_unreachable("Unknown fixup kind!");
- case FK_Data_2:
- assert((int64_t)Value >= -32768 &&
- (int64_t)Value <= 65536 &&
- "Out of range ABS16 fixup");
- return Value;
- case FK_Data_4:
- assert((int64_t)Value >= -(1LL << 31) &&
- (int64_t)Value <= (1LL << 32) - 1 &&
- "Out of range ABS32 fixup");
- return Value;
- case FK_Data_8:
- return Value;
-
- case AArch64::fixup_a64_ld_gottprel_prel19:
- // R_AARCH64_LD_GOTTPREL_PREL19: Set a load-literal immediate to bits 1F
- // FFFC of G(TPREL(S+A)) - P; check -2^20 <= X < 2^20.
- case AArch64::fixup_a64_ld_prel:
- // R_AARCH64_LD_PREL_LO19: Sets a load-literal (immediate) value to bits
- // 1F FFFC of S+A-P, checking that -2^20 <= S+A-P < 2^20.
- assert((int64_t)Value >= -(1LL << 20) &&
- (int64_t)Value < (1LL << 20) && "Out of range LDR (lit) fixup");
- return (Value & 0x1ffffc) << 3;
-
- case AArch64::fixup_a64_adr_prel:
- // R_AARCH64_ADR_PREL_LO21: Sets an ADR immediate value to bits 1F FFFF of
- // the result of S+A-P, checking that -2^20 <= S+A-P < 2^20.
- assert((int64_t)Value >= -(1LL << 20) &&
- (int64_t)Value < (1LL << 20) && "Out of range ADR fixup");
- return ADRImmBits(Value & 0x1fffff);
-
- case AArch64::fixup_a64_adr_prel_page:
- // R_AARCH64_ADR_PREL_PG_HI21: Sets an ADRP immediate value to bits 1 FFFF
- // F000 of the result of the operation, checking that -2^32 <= result <
- // 2^32.
- assert((int64_t)Value >= -(1LL << 32) &&
- (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup");
- return ADRImmBits((Value & 0x1fffff000ULL) >> 12);
-
- case AArch64::fixup_a64_add_dtprel_hi12:
- // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits
- // FF F000 of DTPREL(S+A), check 0 <= X < 2^24.
- case AArch64::fixup_a64_add_tprel_hi12:
- // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits
- // FF F000 of TPREL(S+A), check 0 <= X < 2^24.
- assert((int64_t)Value >= 0 &&
- (int64_t)Value < (1LL << 24) && "Out of range ADD fixup");
- return (Value & 0xfff000) >> 2;
-
- case AArch64::fixup_a64_add_dtprel_lo12:
- // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits
- // FFF of DTPREL(S+A), check 0 <= X < 2^12.
- case AArch64::fixup_a64_add_tprel_lo12:
- // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits
- // FFF of TPREL(S+A), check 0 <= X < 2^12.
- assert((int64_t)Value >= 0 &&
- (int64_t)Value < (1LL << 12) && "Out of range ADD fixup");
- // ... fallthrough to no-checking versions ...
- case AArch64::fixup_a64_add_dtprel_lo12_nc:
- // R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: Set an ADD immediate field to bits
- // FFF of DTPREL(S+A) with no overflow check.
- case AArch64::fixup_a64_add_tprel_lo12_nc:
- // R_AARCH64_TLSLD_ADD_TPREL_LO12_NC: Set an ADD immediate field to bits
- // FFF of TPREL(S+A) with no overflow check.
- case AArch64::fixup_a64_tlsdesc_add_lo12_nc:
- // R_AARCH64_TLSDESC_ADD_LO12_NC: Set an ADD immediate field to bits
- // FFF of G(TLSDESC(S+A)), with no overflow check.
- case AArch64::fixup_a64_add_lo12:
- // R_AARCH64_ADD_ABS_LO12_NC: Sets an ADD immediate value to bits FFF of
- // S+A, with no overflow check.
- return (Value & 0xfff) << 10;
-
- case AArch64::fixup_a64_ldst8_dtprel_lo12:
- // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF
- // of DTPREL(S+A), check 0 <= X < 2^12.
- case AArch64::fixup_a64_ldst8_tprel_lo12:
- // R_AARCH64_TLSLE_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF
- // of DTPREL(S+A), check 0 <= X < 2^12.
- assert((int64_t) Value >= 0 &&
- (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
- // ... fallthrough to no-checking versions ...
- case AArch64::fixup_a64_ldst8_dtprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF
- // of DTPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst8_tprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF
- // of TPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst8_lo12:
- // R_AARCH64_LDST8_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFF
- // of S+A, with no overflow check.
- return (Value & 0xfff) << 10;
-
- case AArch64::fixup_a64_ldst16_dtprel_lo12:
- // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE
- // of DTPREL(S+A), check 0 <= X < 2^12.
- case AArch64::fixup_a64_ldst16_tprel_lo12:
- // R_AARCH64_TLSLE_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE
- // of DTPREL(S+A), check 0 <= X < 2^12.
- assert((int64_t) Value >= 0 &&
- (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
- // ... fallthrough to no-checking versions ...
- case AArch64::fixup_a64_ldst16_dtprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE
- // of DTPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst16_tprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE
- // of TPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst16_lo12:
- // R_AARCH64_LDST16_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFE
- // of S+A, with no overflow check.
- return (Value & 0xffe) << 9;
-
- case AArch64::fixup_a64_ldst32_dtprel_lo12:
- // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC
- // of DTPREL(S+A), check 0 <= X < 2^12.
- case AArch64::fixup_a64_ldst32_tprel_lo12:
- // R_AARCH64_TLSLE_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC
- // of DTPREL(S+A), check 0 <= X < 2^12.
- assert((int64_t) Value >= 0 &&
- (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
- // ... fallthrough to no-checking versions ...
- case AArch64::fixup_a64_ldst32_dtprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC
- // of DTPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst32_tprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC
- // of TPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst32_lo12:
- // R_AARCH64_LDST32_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFC
- // of S+A, with no overflow check.
- return (Value & 0xffc) << 8;
-
- case AArch64::fixup_a64_ldst64_dtprel_lo12:
- // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8
- // of DTPREL(S+A), check 0 <= X < 2^12.
- case AArch64::fixup_a64_ldst64_tprel_lo12:
- // R_AARCH64_TLSLE_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8
- // of DTPREL(S+A), check 0 <= X < 2^12.
- assert((int64_t) Value >= 0 &&
- (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
- // ... fallthrough to no-checking versions ...
- case AArch64::fixup_a64_ldst64_dtprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8
- // of DTPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst64_tprel_lo12_nc:
- // R_AARCH64_TLSLD_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8
- // of TPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_ldst64_lo12:
- // R_AARCH64_LDST64_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF8
- // of S+A, with no overflow check.
- return (Value & 0xff8) << 7;
-
- case AArch64::fixup_a64_ldst128_lo12:
- // R_AARCH64_LDST128_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF0
- // of S+A, with no overflow check.
- return (Value & 0xff0) << 6;
-
- case AArch64::fixup_a64_movw_uabs_g0:
- // R_AARCH64_MOVW_UABS_G0: Sets a MOVZ immediate field to bits FFFF of S+A
- // with a check that S+A < 2^16
- assert(Value <= 0xffff && "Out of range move wide fixup");
- return (Value & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_dtprel_g0_nc:
- // R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: Sets a MOVK immediate field to bits
- // FFFF of DTPREL(S+A) with no overflow check.
- case AArch64::fixup_a64_movw_gottprel_g0_nc:
- // R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: Sets a MOVK immediate field to bits
- // FFFF of G(TPREL(S+A)) - GOT with no overflow check.
- case AArch64::fixup_a64_movw_tprel_g0_nc:
- // R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: Sets a MOVK immediate field to bits
- // FFFF of TPREL(S+A) with no overflow check.
- case AArch64::fixup_a64_movw_uabs_g0_nc:
- // R_AARCH64_MOVW_UABS_G0_NC: Sets a MOVK immediate field to bits FFFF of
- // S+A with no overflow check.
- return (Value & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_uabs_g1:
- // R_AARCH64_MOVW_UABS_G1: Sets a MOVZ immediate field to bits FFFF0000 of
- // S+A with a check that S+A < 2^32
- assert(Value <= 0xffffffffull && "Out of range move wide fixup");
- return ((Value >> 16) & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_dtprel_g1_nc:
- // R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: Set a MOVK immediate field
- // to bits FFFF0000 of DTPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_movw_tprel_g1_nc:
- // R_AARCH64_TLSLD_MOVW_TPREL_G1_NC: Set a MOVK immediate field
- // to bits FFFF0000 of TPREL(S+A), with no overflow check.
- case AArch64::fixup_a64_movw_uabs_g1_nc:
- // R_AARCH64_MOVW_UABS_G1_NC: Sets a MOVK immediate field to bits
- // FFFF0000 of S+A with no overflow check.
- return ((Value >> 16) & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_uabs_g2:
- // R_AARCH64_MOVW_UABS_G2: Sets a MOVZ immediate field to bits FFFF 0000
- // 0000 of S+A with a check that S+A < 2^48
- assert(Value <= 0xffffffffffffull && "Out of range move wide fixup");
- return ((Value >> 32) & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_uabs_g2_nc:
- // R_AARCH64_MOVW_UABS_G2: Sets a MOVK immediate field to bits FFFF 0000
- // 0000 of S+A with no overflow check.
- return ((Value >> 32) & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_uabs_g3:
- // R_AARCH64_MOVW_UABS_G3: Sets a MOVZ immediate field to bits FFFF 0000
- // 0000 0000 of S+A (no overflow check needed)
- return ((Value >> 48) & 0xffff) << 5;
-
- case AArch64::fixup_a64_movw_dtprel_g0:
- // R_AARCH64_TLSLD_MOVW_DTPREL_G0: Set a MOV[NZ] immediate field
- // to bits FFFF of DTPREL(S+A).
- case AArch64::fixup_a64_movw_tprel_g0:
- // R_AARCH64_TLSLE_MOVW_TPREL_G0: Set a MOV[NZ] immediate field to
- // bits FFFF of TPREL(S+A).
- case AArch64::fixup_a64_movw_sabs_g0: {
- // R_AARCH64_MOVW_SABS_G0: Sets MOV[NZ] immediate field using bits FFFF of
- // S+A (see notes below); check -2^16 <= S+A < 2^16. (notes say that we
- // should convert between MOVN and MOVZ to achieve our goals).
- int64_t Signed = Value;
- assert(Signed >= -(1LL << 16) && Signed < (1LL << 16)
- && "Out of range move wide fixup");
- if (Signed >= 0) {
- Value = (Value & 0xffff) << 5;
- // Bit 30 converts the MOVN encoding into a MOVZ
- Value |= 1 << 30;
- } else {
- // MCCodeEmitter should have encoded a MOVN, which is fine.
- Value = (~Value & 0xffff) << 5;
- }
- return Value;
- }
-
- case AArch64::fixup_a64_movw_dtprel_g1:
- // R_AARCH64_TLSLD_MOVW_DTPREL_G1: Set a MOV[NZ] immediate field
- // to bits FFFF0000 of DTPREL(S+A).
- case AArch64::fixup_a64_movw_gottprel_g1:
- // R_AARCH64_TLSIE_MOVW_GOTTPREL_G1: Set a MOV[NZ] immediate field
- // to bits FFFF0000 of G(TPREL(S+A)) - GOT.
- case AArch64::fixup_a64_movw_tprel_g1:
- // R_AARCH64_TLSLE_MOVW_TPREL_G1: Set a MOV[NZ] immediate field to
- // bits FFFF0000 of TPREL(S+A).
- case AArch64::fixup_a64_movw_sabs_g1: {
- // R_AARCH64_MOVW_SABS_G1: Sets MOV[NZ] immediate field using bits FFFF 0000
- // of S+A (see notes below); check -2^32 <= S+A < 2^32. (notes say that we
- // should convert between MOVN and MOVZ to achieve our goals).
- int64_t Signed = Value;
- assert(Signed >= -(1LL << 32) && Signed < (1LL << 32)
- && "Out of range move wide fixup");
- if (Signed >= 0) {
- Value = ((Value >> 16) & 0xffff) << 5;
- // Bit 30 converts the MOVN encoding into a MOVZ
- Value |= 1 << 30;
- } else {
- Value = ((~Value >> 16) & 0xffff) << 5;
- }
- return Value;
- }
-
- case AArch64::fixup_a64_movw_dtprel_g2:
- // R_AARCH64_TLSLD_MOVW_DTPREL_G2: Set a MOV[NZ] immediate field
- // to bits FFFF 0000 0000 of DTPREL(S+A).
- case AArch64::fixup_a64_movw_tprel_g2:
- // R_AARCH64_TLSLE_MOVW_TPREL_G2: Set a MOV[NZ] immediate field to
- // bits FFFF 0000 0000 of TPREL(S+A).
- case AArch64::fixup_a64_movw_sabs_g2: {
- // R_AARCH64_MOVW_SABS_G2: Sets MOV[NZ] immediate field using bits FFFF 0000
- // 0000 of S+A (see notes below); check -2^48 <= S+A < 2^48. (notes say that
- // we should convert between MOVN and MOVZ to achieve our goals).
- int64_t Signed = Value;
- assert(Signed >= -(1LL << 48) && Signed < (1LL << 48)
- && "Out of range move wide fixup");
- if (Signed >= 0) {
- Value = ((Value >> 32) & 0xffff) << 5;
- // Bit 30 converts the MOVN encoding into a MOVZ
- Value |= 1 << 30;
- } else {
- Value = ((~Value >> 32) & 0xffff) << 5;
- }
- return Value;
- }
-
- case AArch64::fixup_a64_tstbr:
- // R_AARCH64_TSTBR14: Sets the immediate field of a TBZ/TBNZ instruction to
- // bits FFFC of S+A-P, checking -2^15 <= S+A-P < 2^15.
- assert((int64_t)Value >= -(1LL << 15) &&
- (int64_t)Value < (1LL << 15) && "Out of range TBZ/TBNZ fixup");
- return (Value & 0xfffc) << (5 - 2);
-
- case AArch64::fixup_a64_condbr:
- // R_AARCH64_CONDBR19: Sets the immediate field of a conditional branch
- // instruction to bits 1FFFFC of S+A-P, checking -2^20 <= S+A-P < 2^20.
- assert((int64_t)Value >= -(1LL << 20) &&
- (int64_t)Value < (1LL << 20) && "Out of range B.cond fixup");
- return (Value & 0x1ffffc) << (5 - 2);
-
- case AArch64::fixup_a64_uncondbr:
- // R_AARCH64_JUMP26 same as below (except to a linker, possibly).
- case AArch64::fixup_a64_call:
- // R_AARCH64_CALL26: Sets a CALL immediate field to bits FFFFFFC of S+A-P,
- // checking that -2^27 <= S+A-P < 2^27.
- assert((int64_t)Value >= -(1LL << 27) &&
- (int64_t)Value < (1LL << 27) && "Out of range branch fixup");
- return (Value & 0xffffffc) >> 2;
-
- case AArch64::fixup_a64_adr_gottprel_page:
- // R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: Set an ADRP immediate field to bits
- // 1FFFFF000 of Page(G(TPREL(S+A))) - Page(P); check -2^32 <= X < 2^32.
- case AArch64::fixup_a64_tlsdesc_adr_page:
- // R_AARCH64_TLSDESC_ADR_PAGE: Set an ADRP immediate field to bits 1FFFFF000
- // of Page(G(TLSDESC(S+A))) - Page(P); check -2^32 <= X < 2^32.
- case AArch64::fixup_a64_adr_prel_got_page:
- // R_AARCH64_ADR_GOT_PAGE: Sets the immediate value of an ADRP to bits
- // 1FFFFF000 of the operation, checking that -2^32 < Page(G(S))-Page(GOT) <
- // 2^32.
- assert((int64_t)Value >= -(1LL << 32) &&
- (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup");
- return ADRImmBits((Value & 0x1fffff000ULL) >> 12);
-
- case AArch64::fixup_a64_ld64_gottprel_lo12_nc:
- // R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: Set an LD offset field to bits FF8
- // of X, with no overflow check. Check that X & 7 == 0.
- case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc:
- // R_AARCH64_TLSDESC_LD64_LO12_NC: Set an LD offset field to bits FF8 of
- // G(TLSDESC(S+A)), with no overflow check. Check that X & 7 == 0.
- case AArch64::fixup_a64_ld64_got_lo12_nc:
- // R_AARCH64_LD64_GOT_LO12_NC: Sets the LD/ST immediate field to bits FF8 of
- // G(S) with no overflow check. Check X & 7 == 0
- assert(((int64_t)Value & 7) == 0 && "Misaligned fixup");
- return (Value & 0xff8) << 7;
-
- case AArch64::fixup_a64_tlsdesc_call:
- // R_AARCH64_TLSDESC_CALL: For relaxation only.
- return 0;
- }
-}
-
-MCAsmBackend *
-llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
- return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ true);
+
+ if (TheTriple.isOSDarwin())
+ return new DarwinAArch64AsmBackend(T, MRI);
+
+ assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+ return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
}
-MCAsmBackend *
-llvm::createAArch64beAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
- return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ false);
+
+ assert(TheTriple.isOSBinFormatELF() &&
+ "Big endian is only supported for ELF targets!");
+ return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+ /*IsLittleEndian=*/false);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index a5fe914..e05191e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCValue.h"
@@ -35,257 +36,222 @@
};
}
-AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian)
- : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
- /*HasRelocationAddend*/ true)
-{}
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
+ bool IsLittleEndian)
+ : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+ /*HasRelocationAddend*/ true) {}
-AArch64ELFObjectWriter::~AArch64ELFObjectWriter()
-{}
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}
unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- unsigned Type;
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ AArch64MCExpr::VariantKind RefKind =
+ static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+ AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
+ bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+
+ assert((!Target.getSymA() ||
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+ "Should only be expression-level modifiers here");
+
+ assert((!Target.getSymB() ||
+ Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+ "Should only be expression-level modifiers here");
+
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
- default:
- llvm_unreachable("Unimplemented fixup -> relocation");
- case FK_Data_8:
- return ELF::R_AARCH64_PREL64;
- case FK_Data_4:
- return ELF::R_AARCH64_PREL32;
case FK_Data_2:
return ELF::R_AARCH64_PREL16;
- case AArch64::fixup_a64_ld_prel:
- Type = ELF::R_AARCH64_LD_PREL_LO19;
- break;
- case AArch64::fixup_a64_adr_prel:
- Type = ELF::R_AARCH64_ADR_PREL_LO21;
- break;
- case AArch64::fixup_a64_adr_prel_page:
- Type = ELF::R_AARCH64_ADR_PREL_PG_HI21;
- break;
- case AArch64::fixup_a64_adr_prel_got_page:
- Type = ELF::R_AARCH64_ADR_GOT_PAGE;
- break;
- case AArch64::fixup_a64_tstbr:
- Type = ELF::R_AARCH64_TSTBR14;
- break;
- case AArch64::fixup_a64_condbr:
- Type = ELF::R_AARCH64_CONDBR19;
- break;
- case AArch64::fixup_a64_uncondbr:
- Type = ELF::R_AARCH64_JUMP26;
- break;
- case AArch64::fixup_a64_call:
- Type = ELF::R_AARCH64_CALL26;
- break;
- case AArch64::fixup_a64_adr_gottprel_page:
- Type = ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
- break;
- case AArch64::fixup_a64_ld_gottprel_prel19:
- Type = ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
- break;
- case AArch64::fixup_a64_tlsdesc_adr_page:
- Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE;
- break;
+ case FK_Data_4:
+ return ELF::R_AARCH64_PREL32;
+ case FK_Data_8:
+ return ELF::R_AARCH64_PREL64;
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation");
+ return ELF::R_AARCH64_ADR_PREL_LO21;
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
+ return ELF::R_AARCH64_ADR_PREL_PG_HI21;
+ if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
+ return ELF::R_AARCH64_ADR_GOT_PAGE;
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
+ return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+ llvm_unreachable("invalid symbol kind for ADRP relocation");
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ return ELF::R_AARCH64_JUMP26;
+ case AArch64::fixup_aarch64_pcrel_call26:
+ return ELF::R_AARCH64_CALL26;
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL)
+ return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
+ return ELF::R_AARCH64_LD_PREL_LO19;
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ return ELF::R_AARCH64_TSTBR14;
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ return ELF::R_AARCH64_CONDBR19;
+ default:
+ llvm_unreachable("Unsupported pc-relative fixup kind");
}
} else {
switch ((unsigned)Fixup.getKind()) {
- default:
- llvm_unreachable("Unimplemented fixup -> relocation");
- case FK_Data_8:
- return ELF::R_AARCH64_ABS64;
- case FK_Data_4:
- return ELF::R_AARCH64_ABS32;
case FK_Data_2:
return ELF::R_AARCH64_ABS16;
- case AArch64::fixup_a64_add_lo12:
- Type = ELF::R_AARCH64_ADD_ABS_LO12_NC;
- break;
- case AArch64::fixup_a64_ld64_got_lo12_nc:
- Type = ELF::R_AARCH64_LD64_GOT_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst8_lo12:
- Type = ELF::R_AARCH64_LDST8_ABS_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst16_lo12:
- Type = ELF::R_AARCH64_LDST16_ABS_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst32_lo12:
- Type = ELF::R_AARCH64_LDST32_ABS_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst64_lo12:
- Type = ELF::R_AARCH64_LDST64_ABS_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst128_lo12:
- Type = ELF::R_AARCH64_LDST128_ABS_LO12_NC;
- break;
- case AArch64::fixup_a64_movw_uabs_g0:
- Type = ELF::R_AARCH64_MOVW_UABS_G0;
- break;
- case AArch64::fixup_a64_movw_uabs_g0_nc:
- Type = ELF::R_AARCH64_MOVW_UABS_G0_NC;
- break;
- case AArch64::fixup_a64_movw_uabs_g1:
- Type = ELF::R_AARCH64_MOVW_UABS_G1;
- break;
- case AArch64::fixup_a64_movw_uabs_g1_nc:
- Type = ELF::R_AARCH64_MOVW_UABS_G1_NC;
- break;
- case AArch64::fixup_a64_movw_uabs_g2:
- Type = ELF::R_AARCH64_MOVW_UABS_G2;
- break;
- case AArch64::fixup_a64_movw_uabs_g2_nc:
- Type = ELF::R_AARCH64_MOVW_UABS_G2_NC;
- break;
- case AArch64::fixup_a64_movw_uabs_g3:
- Type = ELF::R_AARCH64_MOVW_UABS_G3;
- break;
- case AArch64::fixup_a64_movw_sabs_g0:
- Type = ELF::R_AARCH64_MOVW_SABS_G0;
- break;
- case AArch64::fixup_a64_movw_sabs_g1:
- Type = ELF::R_AARCH64_MOVW_SABS_G1;
- break;
- case AArch64::fixup_a64_movw_sabs_g2:
- Type = ELF::R_AARCH64_MOVW_SABS_G2;
- break;
+ case FK_Data_4:
+ return ELF::R_AARCH64_ABS32;
+ case FK_Data_8:
+ return ELF::R_AARCH64_ABS64;
+ case AArch64::fixup_aarch64_add_imm12:
+ if (RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+ return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12;
+ if (RefKind == AArch64MCExpr::VK_TPREL_HI12)
+ return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC)
+ return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_LO12)
+ return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
+ if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC)
+ return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
+ if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
+ return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
+ if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
+ return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_ADD_ABS_LO12_NC;
- // TLS Local-dynamic block
- case AArch64::fixup_a64_movw_dtprel_g2:
- Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
- break;
- case AArch64::fixup_a64_movw_dtprel_g1:
- Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
- break;
- case AArch64::fixup_a64_movw_dtprel_g1_nc:
- Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
- break;
- case AArch64::fixup_a64_movw_dtprel_g0:
- Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
- break;
- case AArch64::fixup_a64_movw_dtprel_g0_nc:
- Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
- break;
- case AArch64::fixup_a64_add_dtprel_hi12:
- Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12;
- break;
- case AArch64::fixup_a64_add_dtprel_lo12:
- Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
- break;
- case AArch64::fixup_a64_add_dtprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst8_dtprel_lo12:
- Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst8_dtprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst16_dtprel_lo12:
- Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst16_dtprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst32_dtprel_lo12:
- Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst32_dtprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst64_dtprel_lo12:
- Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst64_dtprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
- break;
+ report_fatal_error("invalid fixup for add (uimm12) instruction");
+ return 0;
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
- // TLS initial-exec block
- case AArch64::fixup_a64_movw_gottprel_g1:
- Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
- break;
- case AArch64::fixup_a64_movw_gottprel_g0_nc:
- Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
- break;
- case AArch64::fixup_a64_ld64_gottprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
- break;
+ report_fatal_error("invalid fixup for 8-bit load/store instruction");
+ return 0;
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
- // TLS local-exec block
- case AArch64::fixup_a64_movw_tprel_g2:
- Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
- break;
- case AArch64::fixup_a64_movw_tprel_g1:
- Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
- break;
- case AArch64::fixup_a64_movw_tprel_g1_nc:
- Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
- break;
- case AArch64::fixup_a64_movw_tprel_g0:
- Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
- break;
- case AArch64::fixup_a64_movw_tprel_g0_nc:
- Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
- break;
- case AArch64::fixup_a64_add_tprel_hi12:
- Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12;
- break;
- case AArch64::fixup_a64_add_tprel_lo12:
- Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
- break;
- case AArch64::fixup_a64_add_tprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst8_tprel_lo12:
- Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst8_tprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst16_tprel_lo12:
- Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst16_tprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst32_tprel_lo12:
- Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst32_tprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
- break;
- case AArch64::fixup_a64_ldst64_tprel_lo12:
- Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
- break;
- case AArch64::fixup_a64_ldst64_tprel_lo12_nc:
- Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
- break;
+ report_fatal_error("invalid fixup for 16-bit load/store instruction");
+ return 0;
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
- // TLS general-dynamic block
- case AArch64::fixup_a64_tlsdesc_adr_page:
- Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE;
- break;
- case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc:
- Type = ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
- break;
- case AArch64::fixup_a64_tlsdesc_add_lo12_nc:
- Type = ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
- break;
- case AArch64::fixup_a64_tlsdesc_call:
- Type = ELF::R_AARCH64_TLSDESC_CALL;
- break;
+ report_fatal_error("invalid fixup for 32-bit load/store instruction");
+ return 0;
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
+ return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
+ return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
+ return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+
+ report_fatal_error("invalid fixup for 64-bit load/store instruction");
+ return 0;
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
+
+ report_fatal_error("invalid fixup for 128-bit load/store instruction");
+ return 0;
+ case AArch64::fixup_aarch64_movw:
+ if (RefKind == AArch64MCExpr::VK_ABS_G3)
+ return ELF::R_AARCH64_MOVW_UABS_G3;
+ if (RefKind == AArch64MCExpr::VK_ABS_G2)
+ return ELF::R_AARCH64_MOVW_UABS_G2;
+ if (RefKind == AArch64MCExpr::VK_ABS_G2_S)
+ return ELF::R_AARCH64_MOVW_SABS_G2;
+ if (RefKind == AArch64MCExpr::VK_ABS_G2_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G2_NC;
+ if (RefKind == AArch64MCExpr::VK_ABS_G1)
+ return ELF::R_AARCH64_MOVW_UABS_G1;
+ if (RefKind == AArch64MCExpr::VK_ABS_G1_S)
+ return ELF::R_AARCH64_MOVW_SABS_G1;
+ if (RefKind == AArch64MCExpr::VK_ABS_G1_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_ABS_G0)
+ return ELF::R_AARCH64_MOVW_UABS_G0;
+ if (RefKind == AArch64MCExpr::VK_ABS_G0_S)
+ return ELF::R_AARCH64_MOVW_SABS_G0;
+ if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G0_NC;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G0)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
+ if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G2)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G1)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G0)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
+ if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
+ if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1)
+ return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+ if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
+ return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+ report_fatal_error("invalid fixup for movz/movk instruction");
+ return 0;
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ return ELF::R_AARCH64_TLSDESC_CALL;
+ default:
+ llvm_unreachable("Unknown ELF relocation type");
}
}
- return Type;
+ llvm_unreachable("Unimplemented fixup -> relocation");
}
MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS,
- uint8_t OSABI,
- bool IsLittleEndian) {
- MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI, IsLittleEndian);
- return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+ uint8_t OSABI,
+ bool IsLittleEndian) {
+ MCELFObjectTargetWriter *MOTW =
+ new AArch64ELFObjectWriter(OSABI, IsLittleEndian);
+ return createELFObjectWriter(MOTW, OS, IsLittleEndian);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 473b7dd..a79406d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -56,14 +56,14 @@
class AArch64ELFStreamer : public MCELFStreamer {
public:
AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
- MCCodeEmitter *Emitter)
+ MCCodeEmitter *Emitter)
: MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
LastEMS(EMS_None) {}
~AArch64ELFStreamer() {}
- virtual void ChangeSection(const MCSection *Section,
- const MCExpr *Subsection) {
+ void ChangeSection(const MCSection *Section,
+ const MCExpr *Subsection) override {
// We have to keep track of the mapping symbol state of any sections we
// use. Each one should start off as EMS_None, which is provided as the
// default constructor by DenseMap::lookup.
@@ -76,7 +76,8 @@
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- virtual void EmitInstruction(const MCInst& Inst, const MCSubtargetInfo &STI) {
+ void EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) override {
EmitA64MappingSymbol();
MCELFStreamer::EmitInstruction(Inst, STI);
}
@@ -84,7 +85,7 @@
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- virtual void EmitBytes(StringRef Data) {
+ void EmitBytes(StringRef Data) override {
EmitDataMappingSymbol();
MCELFStreamer::EmitBytes(Data);
}
@@ -92,7 +93,8 @@
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
+ void EmitValueImpl(const MCExpr *Value, unsigned Size,
+ const SMLoc &Loc) override {
EmitDataMappingSymbol();
MCELFStreamer::EmitValueImpl(Value, Size);
}
@@ -105,13 +107,15 @@
};
void EmitDataMappingSymbol() {
- if (LastEMS == EMS_Data) return;
+ if (LastEMS == EMS_Data)
+ return;
EmitMappingSymbol("$d");
LastEMS = EMS_Data;
}
void EmitA64MappingSymbol() {
- if (LastEMS == EMS_A64) return;
+ if (LastEMS == EMS_A64)
+ return;
EmitMappingSymbol("$x");
LastEMS = EMS_A64;
}
@@ -120,15 +124,14 @@
MCSymbol *Start = getContext().CreateTempSymbol();
EmitLabel(Start);
- MCSymbol *Symbol =
- getContext().GetOrCreateSymbol(Name + "." +
- Twine(MappingSymbolCounter++));
+ MCSymbol *Symbol = getContext().GetOrCreateSymbol(
+ Name + "." + Twine(MappingSymbolCounter++));
MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
MCELF::SetType(SD, ELF::STT_NOTYPE);
MCELF::SetBinding(SD, ELF::STB_LOCAL);
SD.setExternal(false);
- AssignSection(Symbol, getCurrentSection().first);
+ Symbol->setSection(*getCurrentSection().first);
const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
Symbol->setVariableValue(Value);
@@ -144,16 +147,14 @@
}
namespace llvm {
- MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack) {
- AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
- if (RelaxAll)
- S->getAssembler().setRelaxAll(true);
- if (NoExecStack)
- S->getAssembler().setNoExecStack(true);
- return S;
- }
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ bool RelaxAll, bool NoExecStack) {
+ AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ if (NoExecStack)
+ S->getAssembler().setNoExecStack(true);
+ return S;
}
-
-
+}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index 5a89ca5..bc6973b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -18,10 +18,9 @@
namespace llvm {
- MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS,
- MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack);
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ bool RelaxAll, bool NoExecStack);
}
#endif // AArch64_ELF_STREAMER_H
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index eeb122d..bf405fb 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -1,4 +1,4 @@
-//=- AArch64/AArch64FixupKinds.h - AArch64 Specific Fixup Entries -*- C++ -*-=//
+//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,108 +6,71 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file describes the LLVM fixups applied to MCInsts in the AArch64
-// backend.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64_AARCH64FIXUPKINDS_H
-#define LLVM_AARCH64_AARCH64FIXUPKINDS_H
+#ifndef LLVM_AArch64FIXUPKINDS_H
+#define LLVM_AArch64FIXUPKINDS_H
#include "llvm/MC/MCFixup.h"
namespace llvm {
- namespace AArch64 {
- enum Fixups {
- fixup_a64_ld_prel = FirstTargetFixupKind,
- fixup_a64_adr_prel,
- fixup_a64_adr_prel_page,
+namespace AArch64 {
- fixup_a64_add_lo12,
+enum Fixups {
+ // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
+ // an ADR instruction.
+ fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
- fixup_a64_ldst8_lo12,
- fixup_a64_ldst16_lo12,
- fixup_a64_ldst32_lo12,
- fixup_a64_ldst64_lo12,
- fixup_a64_ldst128_lo12,
+ // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
+ // an ADRP instruction.
+ fixup_aarch64_pcrel_adrp_imm21,
- fixup_a64_tstbr,
- fixup_a64_condbr,
- fixup_a64_uncondbr,
- fixup_a64_call,
+ // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
+ // No alignment adjustment. All value bits are encoded.
+ fixup_aarch64_add_imm12,
- fixup_a64_movw_uabs_g0,
- fixup_a64_movw_uabs_g0_nc,
- fixup_a64_movw_uabs_g1,
- fixup_a64_movw_uabs_g1_nc,
- fixup_a64_movw_uabs_g2,
- fixup_a64_movw_uabs_g2_nc,
- fixup_a64_movw_uabs_g3,
+ // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
+ // store instructions.
+ fixup_aarch64_ldst_imm12_scale1,
+ fixup_aarch64_ldst_imm12_scale2,
+ fixup_aarch64_ldst_imm12_scale4,
+ fixup_aarch64_ldst_imm12_scale8,
+ fixup_aarch64_ldst_imm12_scale16,
- fixup_a64_movw_sabs_g0,
- fixup_a64_movw_sabs_g1,
- fixup_a64_movw_sabs_g2,
+ // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
+ // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+ // pc-relative loads and generates relocations directly when necessary.
+ fixup_aarch64_ldr_pcrel_imm19,
- fixup_a64_adr_prel_got_page,
- fixup_a64_ld64_got_lo12_nc,
+ // FIXME: comment
+ fixup_aarch64_movw,
- // Produce offsets relative to the module's dynamic TLS area.
- fixup_a64_movw_dtprel_g2,
- fixup_a64_movw_dtprel_g1,
- fixup_a64_movw_dtprel_g1_nc,
- fixup_a64_movw_dtprel_g0,
- fixup_a64_movw_dtprel_g0_nc,
- fixup_a64_add_dtprel_hi12,
- fixup_a64_add_dtprel_lo12,
- fixup_a64_add_dtprel_lo12_nc,
- fixup_a64_ldst8_dtprel_lo12,
- fixup_a64_ldst8_dtprel_lo12_nc,
- fixup_a64_ldst16_dtprel_lo12,
- fixup_a64_ldst16_dtprel_lo12_nc,
- fixup_a64_ldst32_dtprel_lo12,
- fixup_a64_ldst32_dtprel_lo12_nc,
- fixup_a64_ldst64_dtprel_lo12,
- fixup_a64_ldst64_dtprel_lo12_nc,
+ // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
+ // immediate.
+ fixup_aarch64_pcrel_branch14,
- // Produce the GOT entry containing a variable's address in TLS's
- // initial-exec mode.
- fixup_a64_movw_gottprel_g1,
- fixup_a64_movw_gottprel_g0_nc,
- fixup_a64_adr_gottprel_page,
- fixup_a64_ld64_gottprel_lo12_nc,
- fixup_a64_ld_gottprel_prel19,
+ // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
+ // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
+ // b.cc and generates relocations directly when necessary.
+ fixup_aarch64_pcrel_branch19,
- // Produce offsets relative to the thread pointer: TPIDR_EL0.
- fixup_a64_movw_tprel_g2,
- fixup_a64_movw_tprel_g1,
- fixup_a64_movw_tprel_g1_nc,
- fixup_a64_movw_tprel_g0,
- fixup_a64_movw_tprel_g0_nc,
- fixup_a64_add_tprel_hi12,
- fixup_a64_add_tprel_lo12,
- fixup_a64_add_tprel_lo12_nc,
- fixup_a64_ldst8_tprel_lo12,
- fixup_a64_ldst8_tprel_lo12_nc,
- fixup_a64_ldst16_tprel_lo12,
- fixup_a64_ldst16_tprel_lo12_nc,
- fixup_a64_ldst32_tprel_lo12,
- fixup_a64_ldst32_tprel_lo12_nc,
- fixup_a64_ldst64_tprel_lo12,
- fixup_a64_ldst64_tprel_lo12_nc,
+ // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+ // immediate.
+ fixup_aarch64_pcrel_branch26,
- // Produce the special fixups used by the general-dynamic TLS model.
- fixup_a64_tlsdesc_adr_page,
- fixup_a64_tlsdesc_ld64_lo12_nc,
- fixup_a64_tlsdesc_add_lo12_nc,
- fixup_a64_tlsdesc_call,
+ // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+ // immediate. Distinguished from branch26 only on ELF.
+ fixup_aarch64_pcrel_call26,
+ // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
+ // R_AARCH64_TLSDESC_CALL relocation.
+ fixup_aarch64_tlsdesc_call,
- // Marker
- LastTargetFixupKind,
- NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
- };
- }
-}
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace AArch64
+} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index b090a55..dc4a8bf 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -13,26 +13,82 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
-AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) {
- Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::aarch64_be)
+enum AsmWriterVariantTy {
+ Default = -1,
+ Generic = 0,
+ Apple = 1
+};
+
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+ "aarch64-neon-syntax", cl::init(Default),
+ cl::desc("Choose style of NEON code to emit from AArch64 backend:"),
+ cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+ clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
+ clEnumValEnd));
+
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+ // We prefer NEON instructions to be printed in the short form.
+ AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+ PrivateGlobalPrefix = "L";
+ SeparatorString = "%%";
+ CommentString = ";";
+ PointerSize = CalleeSaveStackSlotSize = 8;
+
+ AlignmentIsInBytes = false;
+ UsesELFSectionDirectiveForBSS = true;
+ SupportsDebugInformation = true;
+ UseDataRegionDirectives = true;
+
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+ const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+ // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+ // is an indirect pc-relative reference. The default implementation
+ // won't reference using the GOT, so we need this target-specific
+ // version.
+ MCContext &Context = Streamer.getContext();
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
+ MCSymbol *PCSym = Context.CreateTempSymbol();
+ Streamer.EmitLabel(PCSym);
+ const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
+ return MCBinaryExpr::CreateSub(Res, PC, Context);
+}
+
+AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
+ Triple T(TT);
+ if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
+ // We prefer NEON instructions to be printed in the short form.
+ AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+
PointerSize = 8;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
CommentString = "//";
+ PrivateGlobalPrefix = ".L";
Code32Directive = ".code\t32";
Data16bitsDirective = "\t.hword\t";
Data32bitsDirective = "\t.word\t";
Data64bitsDirective = "\t.xword\t";
+ UseDataRegionDirectives = false;
+
+ WeakRefDirective = "\t.weak\t";
+
HasLEB128 = true;
SupportsDebugInformation = true;
@@ -41,6 +97,3 @@
UseIntegratedAssembler = true;
}
-
-// Pin the vtable to this file.
-void AArch64ELFMCAsmInfo::anchor() {}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 43c0e47..42a031d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -1,4 +1,4 @@
-//==-- AArch64MCAsmInfo.h - AArch64 asm properties -------------*- C++ -*--===//
+//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====//
//
// The LLVM Compiler Infrastructure
//
@@ -11,17 +11,24 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64TARGETASMINFO_H
-#define LLVM_AARCH64TARGETASMINFO_H
+#ifndef AArch64TARGETASMINFO_H
+#define AArch64TARGETASMINFO_H
-#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
namespace llvm {
+class Target;
+class StringRef;
+class MCStreamer;
+struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit AArch64MCAsmInfoDarwin();
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
+};
-struct AArch64ELFMCAsmInfo : public MCAsmInfoELF {
- explicit AArch64ELFMCAsmInfo(StringRef TT);
-private:
- virtual void anchor();
+struct AArch64MCAsmInfoELF : public MCAsmInfo {
+ explicit AArch64MCAsmInfoELF(StringRef TT);
};
} // namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index b9a61ef..464a18c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code =//
+//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=//
//
// The LLVM Compiler Infrastructure
//
@@ -11,10 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -22,98 +21,29 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"
-
using namespace llvm;
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
namespace {
+
class AArch64MCCodeEmitter : public MCCodeEmitter {
- AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION;
- void operator=(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION;
MCContext &Ctx;
+ AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT
public:
- AArch64MCCodeEmitter(MCContext &ctx) : Ctx(ctx) {}
+ AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+ MCContext &ctx)
+ : Ctx(ctx) {}
~AArch64MCCodeEmitter() {}
- unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- template<int MemSize>
- unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return getOffsetUImm12OpValue(MI, OpIdx, Fixups, STI, MemSize);
- }
-
- unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI,
- int MemSize) const;
-
- unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- unsigned getShiftRightImm8(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getShiftRightImm16(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getShiftRightImm32(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getShiftRightImm64(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
- unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- // Labels are handled mostly the same way: a symbol is needed, and
- // just gets some fixup attached.
- template<AArch64::Fixups fixupDesired>
- unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- unsigned getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
-
- unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
-
- unsigned getAddressWithFixup(const MCOperand &MO,
- unsigned FixupKind,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
-
// getBinaryCodeForInstr - TableGen'erated function for getting the
// binary encoding for an instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -122,424 +52,531 @@
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
- unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate
+ /// attached to a load, store or prfm instruction. If operand requires a
+ /// relocation, record it and return zero in that part of the encoding.
+ template <uint32_t FixupKind>
+ uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
- void EmitByte(unsigned char C, raw_ostream &OS) const {
- OS << (char)C;
- }
+ /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+ /// target.
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
- void EmitInstruction(uint32_t Val, raw_ostream &OS) const {
+ /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+ /// the 2-bit shift field.
+ uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+ /// branch target.
+ uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getLoadLiteralOpValue - Return the encoded value for a load-literal
+ /// pc-relative address.
+ uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store
+ /// instruction: bit 0 is whether a shift is present, bit 1 is whether the
+ /// operation is a sign extend (as opposed to a zero extend).
+ uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+ /// branch target.
+ uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getBranchTargetOpValue - Return the encoded value for an unconditional
+ /// branch target.
+ uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+ /// of a MOVZ or MOVK instruction.
+ uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+ uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+ /// shifter (MSL).
+ uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getFixedPointScaleOpValue - Return the encoded value for the
+ // FP-to-fixed-point scale factor.
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getSIMDShift64OpValue - Return the encoded value for the
+ // shift-by-immediate AdvSIMD instructions.
+ uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
+
+ void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
// Output the constant in little endian byte order.
- for (unsigned i = 0; i != 4; ++i) {
- EmitByte(Val & 0xff, OS);
+ for (unsigned i = 0; i != Size; ++i) {
+ EmitByte(Val & 255, OS);
Val >>= 8;
}
}
-
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI) const override;
+
+ unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
template<int hasRs, int hasRt2> unsigned
fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
- unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
- const MCSubtargetInfo &STI) const;
-
- unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
- const MCSubtargetInfo &STI) const;
-
-
+ unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
};
} // end anonymous namespace
-unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO,
- unsigned FixupKind,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- if (!MO.isExpr()) {
- // This can occur for manually decoded or constructed MCInsts, but neither
- // the assembly-parser nor instruction selection will currently produce an
- // MCInst that's not a symbol reference.
- assert(MO.isImm() && "Unexpected address requested");
- return MO.getImm();
- }
-
- const MCExpr *Expr = MO.getExpr();
- MCFixupKind Kind = MCFixupKind(FixupKind);
- Fixups.push_back(MCFixup::Create(0, Expr, Kind));
-
- return 0;
+MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new AArch64MCCodeEmitter(MCII, STI, Ctx);
}
-unsigned AArch64MCCodeEmitter::
-getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI,
- int MemSize) const {
- const MCOperand &ImmOp = MI.getOperand(OpIdx);
- if (ImmOp.isImm())
- return ImmOp.getImm();
-
- assert(ImmOp.isExpr() && "Unexpected operand type");
- const AArch64MCExpr *Expr = cast<AArch64MCExpr>(ImmOp.getExpr());
- unsigned FixupKind;
-
-
- switch (Expr->getKind()) {
- default: llvm_unreachable("Unexpected operand modifier");
- case AArch64MCExpr::VK_AARCH64_LO12: {
- static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12,
- AArch64::fixup_a64_ldst16_lo12,
- AArch64::fixup_a64_ldst32_lo12,
- AArch64::fixup_a64_ldst64_lo12,
- AArch64::fixup_a64_ldst128_lo12 };
- assert(MemSize <= 16 && "Invalid fixup for operation");
- FixupKind = FixupsBySize[Log2_32(MemSize)];
- break;
- }
- case AArch64MCExpr::VK_AARCH64_GOT_LO12:
- assert(MemSize == 8 && "Invalid fixup for operation");
- FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc;
- break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: {
- static const unsigned FixupsBySize[] = {
- AArch64::fixup_a64_ldst8_dtprel_lo12,
- AArch64::fixup_a64_ldst16_dtprel_lo12,
- AArch64::fixup_a64_ldst32_dtprel_lo12,
- AArch64::fixup_a64_ldst64_dtprel_lo12
- };
- assert(MemSize <= 8 && "Invalid fixup for operation");
- FixupKind = FixupsBySize[Log2_32(MemSize)];
- break;
- }
- case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: {
- static const unsigned FixupsBySize[] = {
- AArch64::fixup_a64_ldst8_dtprel_lo12_nc,
- AArch64::fixup_a64_ldst16_dtprel_lo12_nc,
- AArch64::fixup_a64_ldst32_dtprel_lo12_nc,
- AArch64::fixup_a64_ldst64_dtprel_lo12_nc
- };
- assert(MemSize <= 8 && "Invalid fixup for operation");
- FixupKind = FixupsBySize[Log2_32(MemSize)];
- break;
- }
- case AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12:
- assert(MemSize == 8 && "Invalid fixup for operation");
- FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc;
- break;
- case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{
- static const unsigned FixupsBySize[] = {
- AArch64::fixup_a64_ldst8_tprel_lo12,
- AArch64::fixup_a64_ldst16_tprel_lo12,
- AArch64::fixup_a64_ldst32_tprel_lo12,
- AArch64::fixup_a64_ldst64_tprel_lo12
- };
- assert(MemSize <= 8 && "Invalid fixup for operation");
- FixupKind = FixupsBySize[Log2_32(MemSize)];
- break;
- }
- case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: {
- static const unsigned FixupsBySize[] = {
- AArch64::fixup_a64_ldst8_tprel_lo12_nc,
- AArch64::fixup_a64_ldst16_tprel_lo12_nc,
- AArch64::fixup_a64_ldst32_tprel_lo12_nc,
- AArch64::fixup_a64_ldst64_tprel_lo12_nc
- };
- assert(MemSize <= 8 && "Invalid fixup for operation");
- FixupKind = FixupsBySize[Log2_32(MemSize)];
- break;
- }
- case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12:
- assert(MemSize == 8 && "Invalid fixup for operation");
- FixupKind = AArch64::fixup_a64_tlsdesc_ld64_lo12_nc;
- break;
- }
-
- return getAddressWithFixup(ImmOp, FixupKind, Fixups, STI);
-}
-
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
unsigned
-AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- if (MO.isImm())
- return static_cast<unsigned>(MO.getImm());
-
- assert(MO.isExpr());
-
- unsigned FixupKind = 0;
- switch(cast<AArch64MCExpr>(MO.getExpr())->getKind()) {
- default: llvm_unreachable("Invalid expression modifier");
- case AArch64MCExpr::VK_AARCH64_LO12:
- FixupKind = AArch64::fixup_a64_add_lo12; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_HI12:
- FixupKind = AArch64::fixup_a64_add_dtprel_hi12; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_LO12:
- FixupKind = AArch64::fixup_a64_add_dtprel_lo12; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC:
- FixupKind = AArch64::fixup_a64_add_dtprel_lo12_nc; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_HI12:
- FixupKind = AArch64::fixup_a64_add_tprel_hi12; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_LO12:
- FixupKind = AArch64::fixup_a64_add_tprel_lo12; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC:
- FixupKind = AArch64::fixup_a64_add_tprel_lo12_nc; break;
- case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12:
- FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break;
- }
-
- return getAddressWithFixup(MO, FixupKind, Fixups, STI);
-}
-
-unsigned
-AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-
- const MCOperand &MO = MI.getOperand(OpIdx);
- if (MO.isImm())
- return static_cast<unsigned>(MO.getImm());
-
- assert(MO.isExpr());
-
- unsigned Modifier = AArch64MCExpr::VK_AARCH64_None;
- if (const AArch64MCExpr *Expr = dyn_cast<AArch64MCExpr>(MO.getExpr()))
- Modifier = Expr->getKind();
-
- unsigned FixupKind = 0;
- switch(Modifier) {
- case AArch64MCExpr::VK_AARCH64_None:
- FixupKind = AArch64::fixup_a64_adr_prel_page;
- break;
- case AArch64MCExpr::VK_AARCH64_GOT:
- FixupKind = AArch64::fixup_a64_adr_prel_got_page;
- break;
- case AArch64MCExpr::VK_AARCH64_GOTTPREL:
- FixupKind = AArch64::fixup_a64_adr_gottprel_page;
- break;
- case AArch64MCExpr::VK_AARCH64_TLSDESC:
- FixupKind = AArch64::fixup_a64_tlsdesc_adr_page;
- break;
- default:
- llvm_unreachable("Unknown symbol reference kind for ADRP instruction");
- }
-
- return getAddressWithFixup(MO, FixupKind, Fixups, STI);
-}
-
-unsigned
-AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Only immediate expected for shift");
-
- return ((32 - MO.getImm()) & 0x1f) | (31 - MO.getImm()) << 6;
-}
-
-unsigned
-AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Only immediate expected for shift");
-
- return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6;
-}
-
-unsigned AArch64MCCodeEmitter::getShiftRightImm8(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 8 - MI.getOperand(Op).getImm();
-}
-
-unsigned AArch64MCCodeEmitter::getShiftRightImm16(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 16 - MI.getOperand(Op).getImm();
-}
-
-unsigned AArch64MCCodeEmitter::getShiftRightImm32(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 32 - MI.getOperand(Op).getImm();
-}
-
-unsigned AArch64MCCodeEmitter::getShiftRightImm64(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return 64 - MI.getOperand(Op).getImm();
-}
-
-unsigned AArch64MCCodeEmitter::getShiftLeftImm8(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return MI.getOperand(Op).getImm() - 8;
-}
-
-unsigned AArch64MCCodeEmitter::getShiftLeftImm16(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return MI.getOperand(Op).getImm() - 16;
-}
-
-unsigned AArch64MCCodeEmitter::getShiftLeftImm32(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return MI.getOperand(Op).getImm() - 32;
-}
-
-unsigned AArch64MCCodeEmitter::getShiftLeftImm64(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return MI.getOperand(Op).getImm() - 64;
-}
-
-template<AArch64::Fixups fixupDesired> unsigned
-AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
- unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
-
- if (MO.isExpr())
- return getAddressWithFixup(MO, fixupDesired, Fixups, STI);
-
- assert(MO.isImm());
- return MO.getImm();
-}
-
-unsigned
-AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI,
- unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
-
- if (MO.isImm())
- return MO.getImm();
-
- assert(MO.isExpr());
-
- unsigned FixupKind;
- if (isa<AArch64MCExpr>(MO.getExpr())) {
- assert(dyn_cast<AArch64MCExpr>(MO.getExpr())->getKind()
- == AArch64MCExpr::VK_AARCH64_GOTTPREL
- && "Invalid symbol modifier for literal load");
- FixupKind = AArch64::fixup_a64_ld_gottprel_prel19;
- } else {
- FixupKind = AArch64::fixup_a64_ld_prel;
- }
-
- return getAddressWithFixup(MO, FixupKind, Fixups, STI);
-}
-
-
-unsigned
-AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI,
- const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- if (MO.isReg()) {
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
- } else if (MO.isImm()) {
+ else {
+ assert(MO.isImm() && "did not expect relocated expression");
return static_cast<unsigned>(MO.getImm());
}
- llvm_unreachable("Unable to encode MCOperand!");
+ assert(0 && "Unable to encode MCOperand!");
return 0;
}
-unsigned
-AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &UImm16MO = MI.getOperand(OpIdx);
- const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1);
+template<unsigned FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ uint32_t ImmVal = 0;
- unsigned Result = static_cast<unsigned>(ShiftMO.getImm()) << 16;
-
- if (UImm16MO.isImm()) {
- Result |= UImm16MO.getImm();
- return Result;
+ if (MO.isImm())
+ ImmVal = static_cast<uint32_t>(MO.getImm());
+ else {
+ assert(MO.isExpr() && "unable to encode load/store imm operand");
+ MCFixupKind Kind = MCFixupKind(FixupKind);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+ ++MCNumFixups;
}
- const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
- AArch64::Fixups requestedFixup;
- switch (A64E->getKind()) {
- default: llvm_unreachable("unexpected expression modifier");
- case AArch64MCExpr::VK_AARCH64_ABS_G0:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g0; break;
- case AArch64MCExpr::VK_AARCH64_ABS_G0_NC:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g0_nc; break;
- case AArch64MCExpr::VK_AARCH64_ABS_G1:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g1; break;
- case AArch64MCExpr::VK_AARCH64_ABS_G1_NC:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g1_nc; break;
- case AArch64MCExpr::VK_AARCH64_ABS_G2:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g2; break;
- case AArch64MCExpr::VK_AARCH64_ABS_G2_NC:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g2_nc; break;
- case AArch64MCExpr::VK_AARCH64_ABS_G3:
- requestedFixup = AArch64::fixup_a64_movw_uabs_g3; break;
- case AArch64MCExpr::VK_AARCH64_SABS_G0:
- requestedFixup = AArch64::fixup_a64_movw_sabs_g0; break;
- case AArch64MCExpr::VK_AARCH64_SABS_G1:
- requestedFixup = AArch64::fixup_a64_movw_sabs_g1; break;
- case AArch64MCExpr::VK_AARCH64_SABS_G2:
- requestedFixup = AArch64::fixup_a64_movw_sabs_g2; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
- requestedFixup = AArch64::fixup_a64_movw_dtprel_g2; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
- requestedFixup = AArch64::fixup_a64_movw_dtprel_g1; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC:
- requestedFixup = AArch64::fixup_a64_movw_dtprel_g1_nc; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
- requestedFixup = AArch64::fixup_a64_movw_dtprel_g0; break;
- case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC:
- requestedFixup = AArch64::fixup_a64_movw_dtprel_g0_nc; break;
- case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
- requestedFixup = AArch64::fixup_a64_movw_gottprel_g1; break;
- case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC:
- requestedFixup = AArch64::fixup_a64_movw_gottprel_g0_nc; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_G2:
- requestedFixup = AArch64::fixup_a64_movw_tprel_g2; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_G1:
- requestedFixup = AArch64::fixup_a64_movw_tprel_g1; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC:
- requestedFixup = AArch64::fixup_a64_movw_tprel_g1_nc; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_G0:
- requestedFixup = AArch64::fixup_a64_movw_tprel_g0; break;
- case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC:
- requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break;
- }
-
- return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups, STI);
+ return ImmVal;
}
-template<int hasRs, int hasRt2> unsigned
-AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
- unsigned EncodedValue,
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+ const MCExpr *Expr = MO.getExpr();
+
+ MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+ ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+ : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+ MCNumFixups += 1;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the 2-bit shift field. The shift field is stored in bits 13-14 of the
+/// return value.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Suboperands are [imm, shifter].
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+ "unexpected shift type for add/sub immediate");
+ unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+ assert((ShiftVal == 0 || ShiftVal == 12) &&
+ "unexpected shift value for add/sub immediate");
+ if (MO.isImm())
+ return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+ assert(MO.isExpr() && "Unable to encode MCOperand!");
+ const MCExpr *Expr = MO.getExpr();
+
+ // Encode the 12 bits of the fixup.
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ return 0;
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getLoadLiteralOpValue - Return the encoded value for a load-literal
+/// pc-relative address.
+uint32_t
+AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- if (!hasRs) EncodedValue |= 0x001F0000;
- if (!hasRt2) EncodedValue |= 0x00007C00;
+ const MCOperand &MO = MI.getOperand(OpIdx);
- return EncodedValue;
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
}
-unsigned
-AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
- const MCSubtargetInfo &STI) const {
+uint32_t
+AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned SignExtend = MI.getOperand(OpIdx).getImm();
+ unsigned DoShift = MI.getOperand(OpIdx + 1).getImm();
+ return (SignExtend << 1) | DoShift;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+ Fixups.push_back(MCFixup::Create(
+ 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc()));
+
+ ++MCNumFixups;
+
+ return 0;
+}
+
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected ADR target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected ADR target type!");
+
+ MCFixupKind Kind = MI.getOpcode() == AArch64::BL
+ ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)
+ : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+/// 00 -> 0
+/// 01 -> 8
+/// 10 -> 16
+/// 11 -> 24
+uint32_t
+AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+
+ switch (MO.getImm()) {
+ default:
+ break;
+ case 0:
+ return 0;
+ case 8:
+ return 1;
+ case 16:
+ return 2;
+ case 24:
+ return 3;
+ }
+
+ assert(false && "Invalid value for vector shift amount!");
+ return 0;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 64 - (MO.getImm());
+}
+
+uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 64 - (MO.getImm() | 32);
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 32 - (MO.getImm() | 16);
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 16 - (MO.getImm() | 8);
+}
+
+/// getFixedPointScaleOpValue - Return the encoded value for the
+// FP-to-fixed-point scale factor.
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 32 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 16 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 8 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 64;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 32;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 16;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL).
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() &&
+ "Expected an immediate value for the move shift amount!");
+ unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm());
+ assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+ return ShiftVal == 8 ? 0 : 1;
+}
+
+unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
// If one of the signed fixup kinds is applied to a MOVZ instruction, the
// eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
// job to ensure that any bits possibly affected by this are 0. This means we
@@ -552,23 +589,38 @@
const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
switch (A64E->getKind()) {
- case AArch64MCExpr::VK_AARCH64_SABS_G0:
- case AArch64MCExpr::VK_AARCH64_SABS_G1:
- case AArch64MCExpr::VK_AARCH64_SABS_G2:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
- case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
- case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
- case AArch64MCExpr::VK_AARCH64_TPREL_G2:
- case AArch64MCExpr::VK_AARCH64_TPREL_G1:
- case AArch64MCExpr::VK_AARCH64_TPREL_G0:
+ case AArch64MCExpr::VK_DTPREL_G2:
+ case AArch64MCExpr::VK_DTPREL_G1:
+ case AArch64MCExpr::VK_DTPREL_G0:
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G2:
+ case AArch64MCExpr::VK_TPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G0:
return EncodedValue & ~(1u << 30);
default:
// Nothing to do for an unsigned fixup.
return EncodedValue;
}
- llvm_unreachable("Should have returned by now");
+
+ return EncodedValue & ~(1u << 30);
+}
+
+void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+ // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+ // following (BLR) instruction. It doesn't emit any code itself so it
+ // doesn't go through the normal TableGenerated channels.
+ MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
+ Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
+ return;
+ }
+
+ uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+ EmitConstant(Binary, 4, OS);
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
}
unsigned
@@ -581,32 +633,22 @@
return EncodedValue;
}
-MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new AArch64MCCodeEmitter(Ctx);
+template<int hasRs, int hasRt2> unsigned
+AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (!hasRs) EncodedValue |= 0x001F0000;
+ if (!hasRt2) EncodedValue |= 0x00007C00;
+
+ return EncodedValue;
}
-void AArch64MCCodeEmitter::
-EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- if (MI.getOpcode() == AArch64::TLSDESCCALL) {
- // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
- // following (BLR) instruction. It doesn't emit any code itself so it
- // doesn't go through the normal TableGenerated channels.
- MCFixupKind Fixup = MCFixupKind(AArch64::fixup_a64_tlsdesc_call);
- const MCExpr *Expr;
- Expr = AArch64MCExpr::CreateTLSDesc(MI.getOperand(0).getExpr(), Ctx);
- Fixups.push_back(MCFixup::Create(0, Expr, Fixup));
- return;
- }
-
- uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
-
- EmitInstruction(Binary, OS);
+unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
+ const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const {
+ // The Rm field of FCMP and friends is unused - it should be assembled
+ // as 0, but is ignored by the processor.
+ EncodedValue &= ~(0x1f << 16);
+ return EncodedValue;
}
-
#include "AArch64GenMCCodeEmitter.inc"
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index c7ccaee..85c3ec7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,136 +12,73 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "aarch64mcexpr"
#include "AArch64MCExpr.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-const AArch64MCExpr*
-AArch64MCExpr::Create(VariantKind Kind, const MCExpr *Expr,
- MCContext &Ctx) {
- return new (Ctx) AArch64MCExpr(Kind, Expr);
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
+const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx) {
+ return new (Ctx) AArch64MCExpr(Expr, Kind);
+}
+
+StringRef AArch64MCExpr::getVariantKindName() const {
+ switch (static_cast<uint32_t>(getKind())) {
+ case VK_CALL: return "";
+ case VK_LO12: return ":lo12:";
+ case VK_ABS_G3: return ":abs_g3:";
+ case VK_ABS_G2: return ":abs_g2:";
+ case VK_ABS_G2_S: return ":abs_g2_s:";
+ case VK_ABS_G2_NC: return ":abs_g2_nc:";
+ case VK_ABS_G1: return ":abs_g1:";
+ case VK_ABS_G1_S: return ":abs_g1_s:";
+ case VK_ABS_G1_NC: return ":abs_g1_nc:";
+ case VK_ABS_G0: return ":abs_g0:";
+ case VK_ABS_G0_S: return ":abs_g0_s:";
+ case VK_ABS_G0_NC: return ":abs_g0_nc:";
+ case VK_DTPREL_G2: return ":dtprel_g2:";
+ case VK_DTPREL_G1: return ":dtprel_g1:";
+ case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
+ case VK_DTPREL_G0: return ":dtprel_g0:";
+ case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:";
+ case VK_DTPREL_HI12: return ":dtprel_hi12:";
+ case VK_DTPREL_LO12: return ":dtprel_lo12:";
+ case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:";
+ case VK_TPREL_G2: return ":tprel_g2:";
+ case VK_TPREL_G1: return ":tprel_g1:";
+ case VK_TPREL_G1_NC: return ":tprel_g1_nc:";
+ case VK_TPREL_G0: return ":tprel_g0:";
+ case VK_TPREL_G0_NC: return ":tprel_g0_nc:";
+ case VK_TPREL_HI12: return ":tprel_hi12:";
+ case VK_TPREL_LO12: return ":tprel_lo12:";
+ case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:";
+ case VK_TLSDESC_LO12: return ":tlsdesc_lo12:";
+ case VK_ABS_PAGE: return "";
+ case VK_GOT_PAGE: return ":got:";
+ case VK_GOT_LO12: return ":got_lo12:";
+ case VK_GOTTPREL_PAGE: return ":gottprel:";
+ case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:";
+ case VK_GOTTPREL_G1: return ":gottprel_g1:";
+ case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:";
+ case VK_TLSDESC: return "";
+ case VK_TLSDESC_PAGE: return ":tlsdesc:";
+ default:
+ llvm_unreachable("Invalid ELF symbol kind");
+ }
}
void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
- switch (Kind) {
- default: llvm_unreachable("Invalid kind!");
- case VK_AARCH64_GOT: OS << ":got:"; break;
- case VK_AARCH64_GOT_LO12: OS << ":got_lo12:"; break;
- case VK_AARCH64_LO12: OS << ":lo12:"; break;
- case VK_AARCH64_ABS_G0: OS << ":abs_g0:"; break;
- case VK_AARCH64_ABS_G0_NC: OS << ":abs_g0_nc:"; break;
- case VK_AARCH64_ABS_G1: OS << ":abs_g1:"; break;
- case VK_AARCH64_ABS_G1_NC: OS << ":abs_g1_nc:"; break;
- case VK_AARCH64_ABS_G2: OS << ":abs_g2:"; break;
- case VK_AARCH64_ABS_G2_NC: OS << ":abs_g2_nc:"; break;
- case VK_AARCH64_ABS_G3: OS << ":abs_g3:"; break;
- case VK_AARCH64_SABS_G0: OS << ":abs_g0_s:"; break;
- case VK_AARCH64_SABS_G1: OS << ":abs_g1_s:"; break;
- case VK_AARCH64_SABS_G2: OS << ":abs_g2_s:"; break;
- case VK_AARCH64_DTPREL_G2: OS << ":dtprel_g2:"; break;
- case VK_AARCH64_DTPREL_G1: OS << ":dtprel_g1:"; break;
- case VK_AARCH64_DTPREL_G1_NC: OS << ":dtprel_g1_nc:"; break;
- case VK_AARCH64_DTPREL_G0: OS << ":dtprel_g0:"; break;
- case VK_AARCH64_DTPREL_G0_NC: OS << ":dtprel_g0_nc:"; break;
- case VK_AARCH64_DTPREL_HI12: OS << ":dtprel_hi12:"; break;
- case VK_AARCH64_DTPREL_LO12: OS << ":dtprel_lo12:"; break;
- case VK_AARCH64_DTPREL_LO12_NC: OS << ":dtprel_lo12_nc:"; break;
- case VK_AARCH64_GOTTPREL_G1: OS << ":gottprel_g1:"; break;
- case VK_AARCH64_GOTTPREL_G0_NC: OS << ":gottprel_g0_nc:"; break;
- case VK_AARCH64_GOTTPREL: OS << ":gottprel:"; break;
- case VK_AARCH64_GOTTPREL_LO12: OS << ":gottprel_lo12:"; break;
- case VK_AARCH64_TPREL_G2: OS << ":tprel_g2:"; break;
- case VK_AARCH64_TPREL_G1: OS << ":tprel_g1:"; break;
- case VK_AARCH64_TPREL_G1_NC: OS << ":tprel_g1_nc:"; break;
- case VK_AARCH64_TPREL_G0: OS << ":tprel_g0:"; break;
- case VK_AARCH64_TPREL_G0_NC: OS << ":tprel_g0_nc:"; break;
- case VK_AARCH64_TPREL_HI12: OS << ":tprel_hi12:"; break;
- case VK_AARCH64_TPREL_LO12: OS << ":tprel_lo12:"; break;
- case VK_AARCH64_TPREL_LO12_NC: OS << ":tprel_lo12_nc:"; break;
- case VK_AARCH64_TLSDESC: OS << ":tlsdesc:"; break;
- case VK_AARCH64_TLSDESC_LO12: OS << ":tlsdesc_lo12:"; break;
-
- }
-
- const MCExpr *Expr = getSubExpr();
- if (Expr->getKind() != MCExpr::SymbolRef)
- OS << '(';
- Expr->print(OS);
- if (Expr->getKind() != MCExpr::SymbolRef)
- OS << ')';
-}
-
-bool
-AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const {
- return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
-}
-
-static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
- switch (Expr->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle nested target expression");
- break;
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
- fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
- fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
- break;
- }
-
- case MCExpr::SymbolRef: {
- // We're known to be under a TLS fixup, so any symbol should be
- // modified. There should be only one.
- const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
- MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
- MCELF::SetType(SD, ELF::STT_TLS);
- break;
- }
-
- case MCExpr::Unary:
- fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
- break;
- }
-}
-
-void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
- switch (getKind()) {
- default:
- return;
- case VK_AARCH64_DTPREL_G2:
- case VK_AARCH64_DTPREL_G1:
- case VK_AARCH64_DTPREL_G1_NC:
- case VK_AARCH64_DTPREL_G0:
- case VK_AARCH64_DTPREL_G0_NC:
- case VK_AARCH64_DTPREL_HI12:
- case VK_AARCH64_DTPREL_LO12:
- case VK_AARCH64_DTPREL_LO12_NC:
- case VK_AARCH64_GOTTPREL_G1:
- case VK_AARCH64_GOTTPREL_G0_NC:
- case VK_AARCH64_GOTTPREL:
- case VK_AARCH64_GOTTPREL_LO12:
- case VK_AARCH64_TPREL_G2:
- case VK_AARCH64_TPREL_G1:
- case VK_AARCH64_TPREL_G1_NC:
- case VK_AARCH64_TPREL_G0:
- case VK_AARCH64_TPREL_G0_NC:
- case VK_AARCH64_TPREL_HI12:
- case VK_AARCH64_TPREL_LO12:
- case VK_AARCH64_TPREL_LO12_NC:
- case VK_AARCH64_TLSDESC:
- case VK_AARCH64_TLSDESC_LO12:
- break;
- }
-
- fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+ if (getKind() != VK_NONE)
+ OS << getVariantKindName();
+ OS << *Expr;
}
// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
@@ -176,3 +113,62 @@
void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
AddValueSymbolsImpl(getSubExpr(), Asm);
}
+
+const MCSection *AArch64MCExpr::FindAssociatedSection() const {
+ llvm_unreachable("FIXME: what goes here?");
+}
+
+bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const {
+ if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+ return false;
+
+ Res =
+ MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+ return true;
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expression");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+ MCELF::SetType(SD, ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getSymbolLoc(Kind)) {
+ default:
+ return;
+ case VK_DTPREL:
+ case VK_GOTTPREL:
+ case VK_TPREL:
+ case VK_TLSDESC:
+ break;
+ }
+
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index d9798ae..e869ed0 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -1,4 +1,4 @@
-//==- AArch64MCExpr.h - AArch64 specific MC expression classes --*- C++ -*-===//
+//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -12,168 +12,149 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64MCEXPR_H
-#define LLVM_AARCH64MCEXPR_H
+#ifndef LLVM_AArch64MCEXPR_H
+#define LLVM_AArch64MCEXPR_H
#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
namespace llvm {
class AArch64MCExpr : public MCTargetExpr {
public:
enum VariantKind {
- VK_AARCH64_None,
- VK_AARCH64_GOT, // :got: modifier in assembly
- VK_AARCH64_GOT_LO12, // :got_lo12:
- VK_AARCH64_LO12, // :lo12:
+ VK_NONE = 0x000,
- VK_AARCH64_ABS_G0, // :abs_g0:
- VK_AARCH64_ABS_G0_NC, // :abs_g0_nc:
- VK_AARCH64_ABS_G1,
- VK_AARCH64_ABS_G1_NC,
- VK_AARCH64_ABS_G2,
- VK_AARCH64_ABS_G2_NC,
- VK_AARCH64_ABS_G3,
+ // Symbol locations specifying (roughly speaking) what calculation should be
+ // performed to construct the final address for the relocated
+ // symbol. E.g. direct, via the GOT, ...
+ VK_ABS = 0x001,
+ VK_SABS = 0x002,
+ VK_GOT = 0x003,
+ VK_DTPREL = 0x004,
+ VK_GOTTPREL = 0x005,
+ VK_TPREL = 0x006,
+ VK_TLSDESC = 0x007,
+ VK_SymLocBits = 0x00f,
- VK_AARCH64_SABS_G0, // :abs_g0_s:
- VK_AARCH64_SABS_G1,
- VK_AARCH64_SABS_G2,
+ // Variants specifying which part of the final address calculation is
+ // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+ // MOVZ/MOVK.
+ VK_PAGE = 0x010,
+ VK_PAGEOFF = 0x020,
+ VK_HI12 = 0x030,
+ VK_G0 = 0x040,
+ VK_G1 = 0x050,
+ VK_G2 = 0x060,
+ VK_G3 = 0x070,
+ VK_AddressFragBits = 0x0f0,
- VK_AARCH64_DTPREL_G2, // :dtprel_g2:
- VK_AARCH64_DTPREL_G1,
- VK_AARCH64_DTPREL_G1_NC,
- VK_AARCH64_DTPREL_G0,
- VK_AARCH64_DTPREL_G0_NC,
- VK_AARCH64_DTPREL_HI12,
- VK_AARCH64_DTPREL_LO12,
- VK_AARCH64_DTPREL_LO12_NC,
+ // Whether the final relocation is a checked one (where a linker should
+ // perform a range-check on the final address) or not. Note that this field
+ // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+ // on its own is a non-checked relocation. We side with ELF on being
+ // explicit about this!
+ VK_NC = 0x100,
- VK_AARCH64_GOTTPREL_G1, // :gottprel:
- VK_AARCH64_GOTTPREL_G0_NC,
- VK_AARCH64_GOTTPREL,
- VK_AARCH64_GOTTPREL_LO12,
+ // Convenience definitions for referring to specific textual representations
+ // of relocation specifiers. Note that this means the "_NC" is sometimes
+ // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+ // since a user would write ":lo12:").
+ VK_CALL = VK_ABS,
+ VK_ABS_PAGE = VK_ABS | VK_PAGE,
+ VK_ABS_G3 = VK_ABS | VK_G3,
+ VK_ABS_G2 = VK_ABS | VK_G2,
+ VK_ABS_G2_S = VK_SABS | VK_G2,
+ VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC,
+ VK_ABS_G1 = VK_ABS | VK_G1,
+ VK_ABS_G1_S = VK_SABS | VK_G1,
+ VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC,
+ VK_ABS_G0 = VK_ABS | VK_G0,
+ VK_ABS_G0_S = VK_SABS | VK_G0,
+ VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC,
+ VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC,
+ VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
+ VK_GOT_PAGE = VK_GOT | VK_PAGE,
+ VK_DTPREL_G2 = VK_DTPREL | VK_G2,
+ VK_DTPREL_G1 = VK_DTPREL | VK_G1,
+ VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC,
+ VK_DTPREL_G0 = VK_DTPREL | VK_G0,
+ VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC,
+ VK_DTPREL_HI12 = VK_DTPREL | VK_HI12,
+ VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF,
+ VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC,
+ VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE,
+ VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+ VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1,
+ VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC,
+ VK_TPREL_G2 = VK_TPREL | VK_G2,
+ VK_TPREL_G1 = VK_TPREL | VK_G1,
+ VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC,
+ VK_TPREL_G0 = VK_TPREL | VK_G0,
+ VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC,
+ VK_TPREL_HI12 = VK_TPREL | VK_HI12,
+ VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF,
+ VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
- VK_AARCH64_TPREL_G2, // :tprel:
- VK_AARCH64_TPREL_G1,
- VK_AARCH64_TPREL_G1_NC,
- VK_AARCH64_TPREL_G0,
- VK_AARCH64_TPREL_G0_NC,
- VK_AARCH64_TPREL_HI12,
- VK_AARCH64_TPREL_LO12,
- VK_AARCH64_TPREL_LO12_NC,
-
- VK_AARCH64_TLSDESC, // :tlsdesc:
- VK_AARCH64_TLSDESC_LO12
+ VK_INVALID = 0xfff
};
private:
- const VariantKind Kind;
const MCExpr *Expr;
+ const VariantKind Kind;
- explicit AArch64MCExpr(VariantKind _Kind, const MCExpr *_Expr)
- : Kind(_Kind), Expr(_Expr) {}
+ explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind)
+ : Expr(Expr), Kind(Kind) {}
public:
/// @name Construction
/// @{
- static const AArch64MCExpr *Create(VariantKind Kind, const MCExpr *Expr,
- MCContext &Ctx);
-
- static const AArch64MCExpr *CreateLo12(const MCExpr *Expr, MCContext &Ctx) {
- return Create(VK_AARCH64_LO12, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateGOT(const MCExpr *Expr, MCContext &Ctx) {
- return Create(VK_AARCH64_GOT, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateGOTLo12(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_GOT_LO12, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateDTPREL_G1(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_DTPREL_G1, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateDTPREL_G0_NC(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_DTPREL_G0_NC, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateGOTTPREL(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_GOTTPREL, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateGOTTPRELLo12(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_GOTTPREL_LO12, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateTLSDesc(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_TLSDESC, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateTLSDescLo12(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_TLSDESC_LO12, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateTPREL_G1(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_TPREL_G1, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateTPREL_G0_NC(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateABS_G3(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_ABS_G3, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateABS_G2_NC(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_ABS_G2_NC, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateABS_G1_NC(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_ABS_G1_NC, Expr, Ctx);
- }
-
- static const AArch64MCExpr *CreateABS_G0_NC(const MCExpr *Expr,
- MCContext &Ctx) {
- return Create(VK_AARCH64_ABS_G0_NC, Expr, Ctx);
- }
+ static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx);
/// @}
/// @name Accessors
/// @{
- /// getOpcode - Get the kind of this expression.
- VariantKind getKind() const { return Kind; }
+ /// Get the kind of this expression.
+ VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
- /// getSubExpr - Get the child of this expression.
+ /// Get the expression this modifier applies to.
const MCExpr *getSubExpr() const { return Expr; }
/// @}
+ /// @name VariantKind information extractors.
+ /// @{
- void PrintImpl(raw_ostream &OS) const;
- bool EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const;
- void AddValueSymbols(MCAssembler *) const;
- const MCSection *FindAssociatedSection() const {
- return getSubExpr()->FindAssociatedSection();
+ static VariantKind getSymbolLoc(VariantKind Kind) {
+ return static_cast<VariantKind>(Kind & VK_SymLocBits);
}
- void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+ static VariantKind getAddressFrag(VariantKind Kind) {
+ return static_cast<VariantKind>(Kind & VK_AddressFragBits);
+ }
+
+ static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
+
+ /// @}
+
+ /// Convert the variant kind into an ELF-appropriate modifier
+ /// (e.g. ":got:", ":lo12:").
+ StringRef getVariantKindName() const;
+
+ void PrintImpl(raw_ostream &OS) const override;
+
+ void AddValueSymbols(MCAssembler *) const override;
+
+ const MCSection *FindAssociatedSection() const override;
+
+ bool EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const override;
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 3d19e42..ae698c5 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions -------------===//
+//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,9 +15,7 @@
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
#include "InstPrinter/AArch64InstPrinter.h"
-#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -25,8 +23,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#define GET_REGINFO_MC_DESC
-#include "AArch64GenRegisterInfo.inc"
+using namespace llvm;
#define GET_INSTRINFO_MC_DESC
#include "AArch64GenInstrInfo.inc"
@@ -34,16 +31,8 @@
#define GET_SUBTARGETINFO_MC_DESC
#include "AArch64GenSubtargetInfo.inc"
-using namespace llvm;
-
-MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT,
- StringRef CPU,
- StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
- return X;
-}
-
+#define GET_REGINFO_MC_DESC
+#include "AArch64GenRegisterInfo.inc"
static MCInstrInfo *createAArch64MCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
@@ -51,9 +40,20 @@
return X;
}
+static MCSubtargetInfo *
+createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+
+ if (CPU.empty())
+ CPU = "generic";
+
+ InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
+ return X;
+}
+
static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitAArch64MCRegisterInfo(X, AArch64::X30);
+ InitAArch64MCRegisterInfo(X, AArch64::LR);
return X;
}
@@ -61,9 +61,17 @@
StringRef TT) {
Triple TheTriple(TT);
- MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(TT);
- unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSDarwin())
+ MAI = new AArch64MCAsmInfoDarwin();
+ else {
+ assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+ MAI = new AArch64MCAsmInfoELF(TT);
+ }
+
+ // Initial state of the frame pointer is SP.
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
MAI->addInitialFrameState(Inst);
return MAI;
@@ -72,40 +80,35 @@
static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) {
- // On ELF platforms the default static relocation model has a smart enough
- // linker to cope with referencing external symbols defined in a shared
- // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
- RM = Reloc::Static;
- }
+ Triple TheTriple(TT);
+ assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+ "Only expect Darwin and ELF targets");
if (CM == CodeModel::Default)
CM = CodeModel::Small;
- else if (CM == CodeModel::JITDefault) {
- // The default MCJIT memory managers make no guarantees about where they can
- // find an executable page; JITed code needs to be able to refer to globals
- // no matter how far away they are.
+ // The default MCJIT memory managers make no guarantees about where they can
+ // find an executable page; JITed code needs to be able to refer to globals
+ // no matter how far away they are.
+ else if (CM == CodeModel::JITDefault)
CM = CodeModel::Large;
- }
+ else if (CM != CodeModel::Small && CM != CodeModel::Large)
+ report_fatal_error(
+ "Only small and large code models are allowed on AArch64");
+ // AArch64 Darwin is always PIC.
+ if (TheTriple.isOSDarwin())
+ RM = Reloc::PIC_;
+ // On ELF platforms the default static relocation model has a smart enough
+ // linker to cope with referencing external symbols defined in a shared
+ // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+ else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+ RM = Reloc::Static;
+
+ MCCodeGenInfo *X = new MCCodeGenInfo();
X->InitMCCodeGenInfo(RM, CM, OL);
return X;
}
-static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
- MCContext &Ctx, MCAsmBackend &MAB,
- raw_ostream &OS,
- MCCodeEmitter *Emitter,
- const MCSubtargetInfo &STI,
- bool RelaxAll,
- bool NoExecStack) {
- Triple TheTriple(TT);
-
- return createAArch64ELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
-}
-
-
static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -114,108 +117,109 @@
const MCSubtargetInfo &STI) {
if (SyntaxVariant == 0)
return new AArch64InstPrinter(MAI, MII, MRI, STI);
- return 0;
+ if (SyntaxVariant == 1)
+ return new AArch64AppleInstPrinter(MAI, MII, MRI, STI);
+
+ return nullptr;
}
-namespace {
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &TAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI, bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
-class AArch64MCInstrAnalysis : public MCInstrAnalysis {
-public:
- AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+ if (TheTriple.isOSDarwin())
+ return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+ /*LabelSections*/ true);
- virtual bool isUnconditionalBranch(const MCInst &Inst) const {
- if (Inst.getOpcode() == AArch64::Bcc
- && Inst.getOperand(0).getImm() == A64CC::AL)
- return true;
- return MCInstrAnalysis::isUnconditionalBranch(Inst);
- }
-
- virtual bool isConditionalBranch(const MCInst &Inst) const {
- if (Inst.getOpcode() == AArch64::Bcc
- && Inst.getOperand(0).getImm() == A64CC::AL)
- return false;
- return MCInstrAnalysis::isConditionalBranch(Inst);
- }
-
- bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size, uint64_t &Target) const {
- unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0;
- // FIXME: We only handle PCRel branches for now.
- if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
- != MCOI::OPERAND_PCREL)
- return false;
-
- int64_t Imm = Inst.getOperand(LblOperand).getImm();
- Target = Addr + Imm;
- return true;
- }
-};
-
+ return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
}
-static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) {
- return new AArch64MCInstrAnalysis(Info);
-}
-
-
-
+// Force static initialization.
extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the MC asm info.
- RegisterMCAsmInfoFn A(TheAArch64leTarget, createAArch64MCAsmInfo);
- RegisterMCAsmInfoFn B(TheAArch64beTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo);
// Register the MC codegen info.
TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
createAArch64MCCodeGenInfo);
TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
createAArch64MCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget,
+ createAArch64MCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget,
+ createAArch64MCCodeGenInfo);
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget,
createAArch64MCInstrInfo);
TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
createAArch64MCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget,
+ createAArch64MCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget,
+ createAArch64MCInstrInfo);
// Register the MC register info.
TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget,
createAArch64MCRegisterInfo);
TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
createAArch64MCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheARM64leTarget,
+ createAArch64MCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheARM64beTarget,
+ createAArch64MCRegisterInfo);
// Register the MC subtarget info.
- using AArch64_MC::createAArch64MCSubtargetInfo;
TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget,
createAArch64MCSubtargetInfo);
TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
createAArch64MCSubtargetInfo);
-
- // Register the MC instruction analyzer.
- TargetRegistry::RegisterMCInstrAnalysis(TheAArch64leTarget,
- createAArch64MCInstrAnalysis);
- TargetRegistry::RegisterMCInstrAnalysis(TheAArch64beTarget,
- createAArch64MCInstrAnalysis);
-
- // Register the MC Code Emitter
- TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
- createAArch64MCCodeEmitter);
- TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
- createAArch64MCCodeEmitter);
+ TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget,
+ createAArch64MCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget,
+ createAArch64MCSubtargetInfo);
// Register the asm backend.
TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget,
createAArch64leAsmBackend);
TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
createAArch64beAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget,
+ createAArch64leAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget,
+ createAArch64beAsmBackend);
+
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
+ createAArch64MCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
+ createAArch64MCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget,
+ createAArch64MCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget,
+ createAArch64MCCodeEmitter);
// Register the object streamer.
TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget,
createMCStreamer);
TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer);
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
createAArch64MCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
createAArch64MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget,
+ createAArch64MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget,
+ createAArch64MCInstPrinter);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index bd8beaf..d886ea2 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -11,18 +11,19 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64MCTARGETDESC_H
-#define LLVM_AARCH64MCTARGETDESC_H
+#ifndef AArch64MCTARGETDESC_H
+#define AArch64MCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
+#include <string>
namespace llvm {
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
class MCRegisterInfo;
+class MCObjectWriter;
class MCSubtargetInfo;
class StringRef;
class Target;
@@ -30,28 +31,25 @@
extern Target TheAArch64leTarget;
extern Target TheAArch64beTarget;
-
-namespace AArch64_MC {
- MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU,
- StringRef FS);
-}
+extern Target TheARM64leTarget;
+extern Target TheARM64beTarget;
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI, StringRef TT,
+ StringRef CPU);
-MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
- uint8_t OSABI,
+MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
bool IsLittleEndian);
-MCAsmBackend *createAArch64leAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
-
-MCAsmBackend *createAArch64beAsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
+MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
+ uint32_t CPUSubtype);
} // End llvm namespace
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
new file mode 100644
index 0000000..5c86189
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -0,0 +1,396 @@
+//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
+ bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+ const MCSymbolRefExpr *Sym,
+ unsigned &Log2Size, const MCAssembler &Asm);
+
+public:
+ AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
+ /*UseAggressiveSymbolFolding=*/true) {}
+
+ void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) override;
+};
+}
+
+bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
+ const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
+ unsigned &Log2Size, const MCAssembler &Asm) {
+ RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
+ Log2Size = ~0U;
+
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ return false;
+
+ case FK_Data_1:
+ Log2Size = llvm::Log2_32(1);
+ return true;
+ case FK_Data_2:
+ Log2Size = llvm::Log2_32(2);
+ return true;
+ case FK_Data_4:
+ Log2Size = llvm::Log2_32(4);
+ if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+ RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+ return true;
+ case FK_Data_8:
+ Log2Size = llvm::Log2_32(8);
+ if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+ RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+ return true;
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ Log2Size = llvm::Log2_32(4);
+ switch (Sym->getKind()) {
+ default:
+ assert(0 && "Unexpected symbol reference variant kind!");
+ case MCSymbolRefExpr::VK_PAGEOFF:
+ RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
+ return true;
+ case MCSymbolRefExpr::VK_GOTPAGEOFF:
+ RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
+ return true;
+ case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+ RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
+ return true;
+ }
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ Log2Size = llvm::Log2_32(4);
+ // This encompasses the relocation for the whole 21-bit value.
+ switch (Sym->getKind()) {
+ default:
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "ADR/ADRP relocations must be GOT relative");
+ case MCSymbolRefExpr::VK_PAGE:
+ RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
+ return true;
+ case MCSymbolRefExpr::VK_GOTPAGE:
+ RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
+ return true;
+ case MCSymbolRefExpr::VK_TLVPPAGE:
+ RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
+ return true;
+ }
+ return true;
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ Log2Size = llvm::Log2_32(4);
+ RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
+ return true;
+ }
+}
+
+void AArch64MachObjectWriter::RecordRelocation(
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
+ unsigned Log2Size = 0;
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+ unsigned Kind = Fixup.getKind();
+
+ FixupOffset += Fixup.getOffset();
+
+ // AArch64 pcrel relocation addends do not include the section offset.
+ if (IsPCRel)
+ FixedValue += FixupOffset;
+
+ // ADRP fixups use relocations for the whole symbol value and only
+ // put the addend in the instruction itself. Clear out any value the
+ // generic code figured out from the sybmol definition.
+ if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+ FixedValue = 0;
+
+ // imm19 relocations are for conditional branches, which require
+ // assembler local symbols. If we got here, that's not what we have,
+ // so complain loudly.
+ if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "conditional branch requires assembler-local"
+ " label. '" +
+ Target.getSymA()->getSymbol().getName() +
+ "' is external.");
+ return;
+ }
+
+ // 14-bit branch relocations should only target internal labels, and so
+ // should never get here.
+ if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "Invalid relocation on conditional branch!");
+ return;
+ }
+
+ if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+ Asm)) {
+ Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
+ return;
+ }
+
+ Value = Target.getConstant();
+
+ if (Target.isAbsolute()) { // constant
+ // FIXME: Should this always be extern?
+ // SymbolNum of 0 indicates the absolute section.
+ Type = MachO::ARM64_RELOC_UNSIGNED;
+ Index = 0;
+
+ if (IsPCRel) {
+ IsExtern = 1;
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "PC relative absolute relocation!");
+
+ // FIXME: x86_64 sets the type to a branch reloc here. Should we do
+ // something similar?
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ const MCSymbolData &A_SD = Asm.getSymbolData(*A);
+ const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ const MCSymbolData &B_SD = Asm.getSymbolData(*B);
+ const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+ // Check for "_foo@got - .", which comes through here as:
+ // Ltmp0:
+ // ... _foo@got - Ltmp0
+ if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
+ Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
+ Layout.getSymbolOffset(&B_SD) ==
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
+ // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+ Index = A_Base->getIndex();
+ IsExtern = 1;
+ Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
+ IsPCRel = 1;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+ return;
+ } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+ // Otherwise, neither symbol can be modified.
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+
+ // We don't support PCrel relocations of differences.
+ if (IsPCRel)
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported pc-relative relocation of "
+ "difference");
+
+ // AArch64 always uses external relocations. If there is no symbol to use as
+ // a base address (a local symbol with no preceding non-local symbol),
+ // error out.
+ //
+ // FIXME: We should probably just synthesize an external symbol and use
+ // that.
+ if (!A_Base)
+ Asm.getContext().FatalError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + A->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ if (!B_Base)
+ Asm.getContext().FatalError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + B->getName() +
+ "'. Must have non-local symbol earlier in section.");
+
+ if (A_Base == B_Base && A_Base)
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported relocation with identical base");
+
+ Value += (!A_SD.getFragment() ? 0
+ : Writer->getSymbolAddress(&A_SD, Layout)) -
+ (!A_Base || !A_Base->getFragment()
+ ? 0
+ : Writer->getSymbolAddress(A_Base, Layout));
+ Value -= (!B_SD.getFragment() ? 0
+ : Writer->getSymbolAddress(&B_SD, Layout)) -
+ (!B_Base || !B_Base->getFragment()
+ ? 0
+ : Writer->getSymbolAddress(B_Base, Layout));
+
+ Index = A_Base->getIndex();
+ IsExtern = 1;
+ Type = MachO::ARM64_RELOC_UNSIGNED;
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+
+ Index = B_Base->getIndex();
+ IsExtern = 1;
+ Type = MachO::ARM64_RELOC_SUBTRACTOR;
+ } else { // A + constant
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData *Base = Asm.getAtom(&SD);
+ const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
+ Fragment->getParent()->getSection());
+
+ // If the symbol is a variable and we weren't able to get a Base for it
+ // (i.e., it's not in the symbol table associated with a section) resolve
+ // the relocation based its expansion instead.
+ if (Symbol->isVariable() && !Base) {
+ // If the evaluation is an absolute value, just use that directly
+ // to keep things easy.
+ int64_t Res;
+ if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+
+ // FIXME: Will the Target we already have ever have any data in it
+ // we need to preserve and merge with the new Target? How about
+ // the FixedValue?
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unable to resolve variable '" +
+ Symbol->getName() + "'");
+ return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand relocation entries and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ Base = nullptr;
+ }
+
+ // AArch64 uses external relocations as much as possible. For debug
+ // sections, and for pointer-sized relocations (.quad), we allow section
+ // relocations. It's code sections that run into trouble.
+ if (Base) {
+ Index = Base->getIndex();
+ IsExtern = 1;
+
+ // Add the local offset, if needed.
+ if (Base != &SD)
+ Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+ } else if (Symbol->isInSection()) {
+ // Pointer-sized relocations can use a local relocation. Otherwise,
+ // we have to be in a debug info section.
+ if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+ Asm.getContext().FatalError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + Symbol->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ // Adjust the relocation to be section-relative.
+ // The index is the section ordinal (1-based).
+ const MCSectionData &SymSD =
+ Asm.getSectionData(SD.getSymbol().getSection());
+ Index = SymSD.getOrdinal() + 1;
+ IsExtern = 0;
+ Value += Writer->getSymbolAddress(&SD, Layout);
+
+ if (IsPCRel)
+ Value -= Writer->getFragmentAddress(Fragment, Layout) +
+ Fixup.getOffset() + (1ULL << Log2Size);
+ } else {
+ // Resolve constant variables.
+ if (SD.getSymbol().isVariable()) {
+ int64_t Res;
+ if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ }
+ }
+
+ // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
+ // is represented via an Addend relocation, not encoded directly into
+ // the instruction.
+ if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
+ Type == MachO::ARM64_RELOC_PAGE21 ||
+ Type == MachO::ARM64_RELOC_PAGEOFF12) &&
+ Value) {
+ assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+
+ // Now set up the Addend relocation.
+ Type = MachO::ARM64_RELOC_ADDEND;
+ Index = Value;
+ IsPCRel = 0;
+ Log2Size = 2;
+ IsExtern = 0;
+
+ // Put zero into the instruction itself. The addend is in the relocation.
+ Value = 0;
+ }
+
+ // If there's any addend left to handle, encode it in the instruction.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
+ uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return createMachObjectWriter(
+ new AArch64MachObjectWriter(CPUType, CPUSubtype), OS,
+ /*IsLittleEndian=*/true);
+}
diff --git a/lib/Target/AArch64/MCTargetDesc/Android.mk b/lib/Target/AArch64/MCTargetDesc/Android.mk
index edcf1f2..c0cdb2b 100644
--- a/lib/Target/AArch64/MCTargetDesc/Android.mk
+++ b/lib/Target/AArch64/MCTargetDesc/Android.mk
@@ -10,6 +10,7 @@
AArch64AsmBackend.cpp \
AArch64ELFObjectWriter.cpp \
AArch64ELFStreamer.cpp \
+ AArch64MachObjectWriter.cpp \
AArch64MCAsmInfo.cpp \
AArch64MCCodeEmitter.cpp \
AArch64MCExpr.cpp \
diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
index 54c4465..7d5bced 100644
--- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
@@ -6,4 +6,9 @@
AArch64MCCodeEmitter.cpp
AArch64MCExpr.cpp
AArch64MCTargetDesc.cpp
- )
+ AArch64MachObjectWriter.cpp
+)
+add_dependencies(LLVMAArch64Desc AArch64CommonTableGen)
+
+# Hack: we need to include 'main' target directory to grab private headers
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt
index 37c8035..70cff0b 100644
--- a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===;
+;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile
index 641bb83..f356c58 100644
--- a/lib/Target/AArch64/Makefile
+++ b/lib/Target/AArch64/Makefile
@@ -12,19 +12,14 @@
TARGET = AArch64
# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AArch64GenAsmMatcher.inc \
- AArch64GenAsmWriter.inc \
- AArch64GenCallingConv.inc \
- AArch64GenDAGISel.inc \
- AArch64GenDisassemblerTables.inc \
- AArch64GenInstrInfo.inc \
- AArch64GenMCCodeEmitter.inc \
- AArch64GenMCPseudoLowering.inc \
- AArch64GenRegisterInfo.inc \
- AArch64GenSubtargetInfo.inc
+BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \
+ AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \
+ AArch64GenDAGISel.inc \
+ AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \
+ AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \
+ AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \
+ AArch64GenMCPseudoLowering.inc
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils
+DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils
include $(LEVEL)/Makefile.common
-
-
diff --git a/lib/Target/AArch64/README.txt b/lib/Target/AArch64/README.txt
deleted file mode 100644
index 601990f..0000000
--- a/lib/Target/AArch64/README.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-This file will contain changes that need to be made before AArch64 can become an
-officially supported target. Currently a placeholder.
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 9281e4e..3a382c1 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -------------===//
+//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,22 +6,26 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file contains the key registration step for the architecture.
-//
-//===----------------------------------------------------------------------===//
-#include "AArch64.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-Target llvm::TheAArch64leTarget;
-Target llvm::TheAArch64beTarget;
+namespace llvm {
+Target TheAArch64leTarget;
+Target TheAArch64beTarget;
+Target TheARM64leTarget;
+Target TheARM64beTarget;
+} // end namespace llvm
extern "C" void LLVMInitializeAArch64TargetInfo() {
- RegisterTarget<Triple::aarch64, /*HasJIT=*/true>
- X(TheAArch64leTarget, "aarch64", "AArch64 (ARM 64-bit little endian target)");
- RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true>
- Y(TheAArch64beTarget, "aarch64_be", "AArch64 (ARM 64-bit big endian target)");
+ RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64",
+ "AArch64 (little endian)");
+ RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be",
+ "AArch64 (big endian)");
+
+ RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
+ TheAArch64leTarget, "aarch64", "AArch64 (little endian)");
+ RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
+ TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)");
}
diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt
index ee734c6..e236eed 100644
--- a/lib/Target/AArch64/TargetInfo/CMakeLists.txt
+++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt
@@ -1,3 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
add_llvm_library(LLVMAArch64Info
AArch64TargetInfo.cpp
)
+
+add_dependencies(LLVMAArch64Info AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
index 6429172..93c5407 100644
--- a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 2a97cd6..3c24bb3 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -18,7 +18,7 @@
using namespace llvm;
-StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
+StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
for (unsigned i = 0; i < NumPairs; ++i) {
if (Pairs[i].Value == Value) {
Valid = true;
@@ -30,7 +30,7 @@
return StringRef();
}
-uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
+uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
std::string LowerCaseName = Name.lower();
for (unsigned i = 0; i < NumPairs; ++i) {
if (Pairs[i].Name == LowerCaseName) {
@@ -43,11 +43,11 @@
return -1;
}
-bool NamedImmMapper::validImm(uint32_t Value) const {
+bool AArch64NamedImmMapper::validImm(uint32_t Value) const {
return Value < TooBigImm;
}
-const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = {
{"s1e1r", S1E1R},
{"s1e2r", S1E2R},
{"s1e3r", S1E3R},
@@ -62,10 +62,10 @@
{"s12e0w", S12E0W},
};
-A64AT::ATMapper::ATMapper()
- : NamedImmMapper(ATPairs, 0) {}
+AArch64AT::ATMapper::ATMapper()
+ : AArch64NamedImmMapper(ATPairs, 0) {}
-const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = {
{"oshld", OSHLD},
{"oshst", OSHST},
{"osh", OSH},
@@ -80,10 +80,10 @@
{"sy", SY}
};
-A64DB::DBarrierMapper::DBarrierMapper()
- : NamedImmMapper(DBarrierPairs, 16u) {}
+AArch64DB::DBarrierMapper::DBarrierMapper()
+ : AArch64NamedImmMapper(DBarrierPairs, 16u) {}
-const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = {
{"zva", ZVA},
{"ivac", IVAC},
{"isw", ISW},
@@ -94,26 +94,26 @@
{"cisw", CISW}
};
-A64DC::DCMapper::DCMapper()
- : NamedImmMapper(DCPairs, 0) {}
+AArch64DC::DCMapper::DCMapper()
+ : AArch64NamedImmMapper(DCPairs, 0) {}
-const NamedImmMapper::Mapping A64IC::ICMapper::ICPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = {
{"ialluis", IALLUIS},
{"iallu", IALLU},
{"ivau", IVAU}
};
-A64IC::ICMapper::ICMapper()
- : NamedImmMapper(ICPairs, 0) {}
+AArch64IC::ICMapper::ICMapper()
+ : AArch64NamedImmMapper(ICPairs, 0) {}
-const NamedImmMapper::Mapping A64ISB::ISBMapper::ISBPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = {
{"sy", SY},
};
-A64ISB::ISBMapper::ISBMapper()
- : NamedImmMapper(ISBPairs, 16) {}
+AArch64ISB::ISBMapper::ISBMapper()
+ : AArch64NamedImmMapper(ISBPairs, 16) {}
-const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = {
{"pldl1keep", PLDL1KEEP},
{"pldl1strm", PLDL1STRM},
{"pldl2keep", PLDL2KEEP},
@@ -134,19 +134,19 @@
{"pstl3strm", PSTL3STRM}
};
-A64PRFM::PRFMMapper::PRFMMapper()
- : NamedImmMapper(PRFMPairs, 32) {}
+AArch64PRFM::PRFMMapper::PRFMMapper()
+ : AArch64NamedImmMapper(PRFMPairs, 32) {}
-const NamedImmMapper::Mapping A64PState::PStateMapper::PStatePairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = {
{"spsel", SPSel},
{"daifset", DAIFSet},
{"daifclr", DAIFClr}
};
-A64PState::PStateMapper::PStateMapper()
- : NamedImmMapper(PStatePairs, 0) {}
+AArch64PState::PStateMapper::PStateMapper()
+ : AArch64NamedImmMapper(PStatePairs, 0) {}
-const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = {
{"mdccsr_el0", MDCCSR_EL0},
{"dbgdtrrx_el0", DBGDTRRX_EL0},
{"mdrar_el1", MDRAR_EL1},
@@ -176,16 +176,16 @@
{"id_isar3_el1", ID_ISAR3_EL1},
{"id_isar4_el1", ID_ISAR4_EL1},
{"id_isar5_el1", ID_ISAR5_EL1},
- {"id_aa64pfr0_el1", ID_AA64PFR0_EL1},
- {"id_aa64pfr1_el1", ID_AA64PFR1_EL1},
- {"id_aa64dfr0_el1", ID_AA64DFR0_EL1},
- {"id_aa64dfr1_el1", ID_AA64DFR1_EL1},
- {"id_aa64afr0_el1", ID_AA64AFR0_EL1},
- {"id_aa64afr1_el1", ID_AA64AFR1_EL1},
- {"id_aa64isar0_el1", ID_AA64ISAR0_EL1},
- {"id_aa64isar1_el1", ID_AA64ISAR1_EL1},
- {"id_aa64mmfr0_el1", ID_AA64MMFR0_EL1},
- {"id_aa64mmfr1_el1", ID_AA64MMFR1_EL1},
+ {"id_aa64pfr0_el1", ID_A64PFR0_EL1},
+ {"id_aa64pfr1_el1", ID_A64PFR1_EL1},
+ {"id_aa64dfr0_el1", ID_A64DFR0_EL1},
+ {"id_aa64dfr1_el1", ID_A64DFR1_EL1},
+ {"id_aa64afr0_el1", ID_A64AFR0_EL1},
+ {"id_aa64afr1_el1", ID_A64AFR1_EL1},
+ {"id_aa64isar0_el1", ID_A64ISAR0_EL1},
+ {"id_aa64isar1_el1", ID_A64ISAR1_EL1},
+ {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1},
+ {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1},
{"mvfr0_el1", MVFR0_EL1},
{"mvfr1_el1", MVFR1_EL1},
{"mvfr2_el1", MVFR2_EL1},
@@ -245,12 +245,13 @@
{"ich_elsr_el2", ICH_ELSR_EL2}
};
-A64SysReg::MRSMapper::MRSMapper() {
+AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits)
+ : SysRegMapper(FeatureBits) {
InstPairs = &MRSPairs[0];
NumInstPairs = llvm::array_lengthof(MRSPairs);
}
-const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = {
{"dbgdtrtx_el0", DBGDTRTX_EL0},
{"oslar_el1", OSLAR_EL1},
{"pmswinc_el0", PMSWINC_EL0},
@@ -268,13 +269,14 @@
{"icc_sgi0r_el1", ICC_SGI0R_EL1}
};
-A64SysReg::MSRMapper::MSRMapper() {
+AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits)
+ : SysRegMapper(FeatureBits) {
InstPairs = &MSRPairs[0];
NumInstPairs = llvm::array_lengthof(MSRPairs);
}
-const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = {
{"osdtrrx_el1", OSDTRRX_EL1},
{"osdtrtx_el1", OSDTRTX_EL1},
{"teecr32_el1", TEECR32_EL1},
@@ -753,10 +755,16 @@
{"ich_lr15_el2", ICH_LR15_EL2}
};
+const AArch64NamedImmMapper::Mapping
+AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = {
+ {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3}
+};
+
uint32_t
-A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
- // First search the registers shared by all
+AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
std::string NameLower = Name.lower();
+
+ // First search the registers shared by all
for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
if (SysRegPairs[i].Name == NameLower) {
Valid = true;
@@ -764,6 +772,16 @@
}
}
+ // Next search for target specific registers
+ if (FeatureBits & AArch64::ProcCyclone) {
+ for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
+ if (CycloneSysRegPairs[i].Name == NameLower) {
+ Valid = true;
+ return CycloneSysRegPairs[i].Value;
+ }
+ }
+ }
+
// Now try the instruction-specific registers (either read-only or
// write-only).
for (unsigned i = 0; i < NumInstPairs; ++i) {
@@ -796,7 +814,8 @@
}
std::string
-A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+ // First search the registers shared by all
for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
if (SysRegPairs[i].Value == Bits) {
Valid = true;
@@ -804,6 +823,18 @@
}
}
+ // Next search for target specific registers
+ if (FeatureBits & AArch64::ProcCyclone) {
+ for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
+ if (CycloneSysRegPairs[i].Value == Bits) {
+ Valid = true;
+ return CycloneSysRegPairs[i].Name;
+ }
+ }
+ }
+
+ // Now try the instruction-specific registers (either read-only or
+ // write-only).
for (unsigned i = 0; i < NumInstPairs; ++i) {
if (InstPairs[i].Value == Bits) {
Valid = true;
@@ -831,7 +862,7 @@
+ "_c" + utostr(CRm) + "_" + utostr(Op2);
}
-const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = {
{"ipas2e1is", IPAS2E1IS},
{"ipas2le1is", IPAS2LE1IS},
{"vmalle1is", VMALLE1IS},
@@ -866,308 +897,5 @@
{"vaale1", VAALE1}
};
-A64TLBI::TLBIMapper::TLBIMapper()
- : NamedImmMapper(TLBIPairs, 0) {}
-
-bool A64Imms::isFPImm(const APFloat &Val, uint32_t &Imm8Bits) {
- const fltSemantics &Sem = Val.getSemantics();
- unsigned FracBits = APFloat::semanticsPrecision(Sem) - 1;
-
- uint32_t ExpMask;
- switch (FracBits) {
- case 10: // IEEE half-precision
- ExpMask = 0x1f;
- break;
- case 23: // IEEE single-precision
- ExpMask = 0xff;
- break;
- case 52: // IEEE double-precision
- ExpMask = 0x7ff;
- break;
- case 112: // IEEE quad-precision
- // No immediates are valid for double precision.
- return false;
- default:
- llvm_unreachable("Only half, single and double precision supported");
- }
-
- uint32_t ExpStart = FracBits;
- uint64_t FracMask = (1ULL << FracBits) - 1;
-
- uint32_t Sign = Val.isNegative();
-
- uint64_t Bits= Val.bitcastToAPInt().getLimitedValue();
- uint64_t Fraction = Bits & FracMask;
- int32_t Exponent = ((Bits >> ExpStart) & ExpMask);
- Exponent -= ExpMask >> 1;
-
- // S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 5):imm8<5:0>:Zeros(19)
- // D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 8):imm8<5:0>:Zeros(48)
- // This translates to: only 4 bits of fraction; -3 <= exp <= 4.
- uint64_t A64FracStart = FracBits - 4;
- uint64_t A64FracMask = 0xf;
-
- // Are there too many fraction bits?
- if (Fraction & ~(A64FracMask << A64FracStart))
- return false;
-
- if (Exponent < -3 || Exponent > 4)
- return false;
-
- uint32_t PackedFraction = (Fraction >> A64FracStart) & A64FracMask;
- uint32_t PackedExp = (Exponent + 7) & 0x7;
-
- Imm8Bits = (Sign << 7) | (PackedExp << 4) | PackedFraction;
- return true;
-}
-
-// Encoding of the immediate for logical (immediate) instructions:
-//
-// | N | imms | immr | size | R | S |
-// |---+--------+--------+------+--------------+--------------|
-// | 1 | ssssss | rrrrrr | 64 | UInt(rrrrrr) | UInt(ssssss) |
-// | 0 | 0sssss | xrrrrr | 32 | UInt(rrrrr) | UInt(sssss) |
-// | 0 | 10ssss | xxrrrr | 16 | UInt(rrrr) | UInt(ssss) |
-// | 0 | 110sss | xxxrrr | 8 | UInt(rrr) | UInt(sss) |
-// | 0 | 1110ss | xxxxrr | 4 | UInt(rr) | UInt(ss) |
-// | 0 | 11110s | xxxxxr | 2 | UInt(r) | UInt(s) |
-// | 0 | 11111x | - | | UNALLOCATED | |
-//
-// Columns 'R', 'S' and 'size' specify a "bitmask immediate" of size bits in
-// which the lower S+1 bits are ones and the remaining bits are zero, then
-// rotated right by R bits, which is then replicated across the datapath.
-//
-// + Values of 'N', 'imms' and 'immr' which do not match the above table are
-// RESERVED.
-// + If all 's' bits in the imms field are set then the instruction is
-// RESERVED.
-// + The 'x' bits in the 'immr' field are IGNORED.
-
-bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
- int RepeatWidth;
- int Rotation = 0;
- int Num1s = 0;
-
- // Because there are S+1 ones in the replicated mask, an immediate of all
- // zeros is not allowed. Filtering it here is probably more efficient.
- if (Imm == 0) return false;
-
- for (RepeatWidth = RegWidth; RepeatWidth > 1; RepeatWidth /= 2) {
- uint64_t RepeatMask = RepeatWidth == 64 ? -1 : (1ULL << RepeatWidth) - 1;
- uint64_t ReplicatedMask = Imm & RepeatMask;
-
- if (ReplicatedMask == 0) continue;
-
- // First we have to make sure the mask is actually repeated in each slot for
- // this width-specifier.
- bool IsReplicatedMask = true;
- for (unsigned i = RepeatWidth; i < RegWidth; i += RepeatWidth) {
- if (((Imm >> i) & RepeatMask) != ReplicatedMask) {
- IsReplicatedMask = false;
- break;
- }
- }
- if (!IsReplicatedMask) continue;
-
- // Now we have to work out the amount of rotation needed. The first part of
- // this calculation is actually independent of RepeatWidth, but the complex
- // case will depend on it.
- Rotation = countTrailingZeros(Imm);
- if (Rotation == 0) {
- // There were no leading zeros, which means it's either in place or there
- // are 1s at each end (e.g. 0x8003 needs rotating).
- Rotation = RegWidth == 64 ? CountLeadingOnes_64(Imm)
- : CountLeadingOnes_32(Imm);
- Rotation = RepeatWidth - Rotation;
- }
-
- uint64_t ReplicatedOnes = ReplicatedMask;
- if (Rotation != 0 && Rotation != 64)
- ReplicatedOnes = (ReplicatedMask >> Rotation)
- | ((ReplicatedMask << (RepeatWidth - Rotation)) & RepeatMask);
-
- // Of course, they may not actually be ones, so we have to check that:
- if (!isMask_64(ReplicatedOnes))
- continue;
-
- Num1s = CountTrailingOnes_64(ReplicatedOnes);
-
- // We know we've got an almost valid encoding (certainly, if this is invalid
- // no other parameters would work).
- break;
- }
-
- // The encodings which would produce all 1s are RESERVED.
- if (RepeatWidth == 1 || Num1s == RepeatWidth) return false;
-
- uint32_t N = RepeatWidth == 64;
- uint32_t ImmR = RepeatWidth - Rotation;
- uint32_t ImmS = Num1s - 1;
-
- switch (RepeatWidth) {
- default: break; // No action required for other valid rotations.
- case 16: ImmS |= 0x20; break; // 10ssss
- case 8: ImmS |= 0x30; break; // 110sss
- case 4: ImmS |= 0x38; break; // 1110ss
- case 2: ImmS |= 0x3c; break; // 11110s
- }
-
- Bits = ImmS | (ImmR << 6) | (N << 12);
-
- return true;
-}
-
-
-bool A64Imms::isLogicalImmBits(unsigned RegWidth, uint32_t Bits,
- uint64_t &Imm) {
- uint32_t N = Bits >> 12;
- uint32_t ImmR = (Bits >> 6) & 0x3f;
- uint32_t ImmS = Bits & 0x3f;
-
- // N=1 encodes a 64-bit replication and is invalid for the 32-bit
- // instructions.
- if (RegWidth == 32 && N != 0) return false;
-
- int Width = 0;
- if (N == 1)
- Width = 64;
- else if ((ImmS & 0x20) == 0)
- Width = 32;
- else if ((ImmS & 0x10) == 0)
- Width = 16;
- else if ((ImmS & 0x08) == 0)
- Width = 8;
- else if ((ImmS & 0x04) == 0)
- Width = 4;
- else if ((ImmS & 0x02) == 0)
- Width = 2;
- else {
- // ImmS is 0b11111x: UNALLOCATED
- return false;
- }
-
- int Num1s = (ImmS & (Width - 1)) + 1;
-
- // All encodings which would map to -1 (signed) are RESERVED.
- if (Num1s == Width) return false;
-
- int Rotation = (ImmR & (Width - 1));
- uint64_t Mask = (1ULL << Num1s) - 1;
- uint64_t WidthMask = Width == 64 ? -1 : (1ULL << Width) - 1;
- if (Rotation != 0 && Rotation != 64)
- Mask = (Mask >> Rotation)
- | ((Mask << (Width - Rotation)) & WidthMask);
-
- Imm = Mask;
- for (unsigned i = 1; i < RegWidth / Width; ++i) {
- Mask <<= Width;
- Imm |= Mask;
- }
-
- return true;
-}
-
-bool A64Imms::isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
- // If high bits are set then a 32-bit MOVZ can't possibly work.
- if (RegWidth == 32 && (Value & ~0xffffffffULL))
- return false;
-
- for (int i = 0; i < RegWidth; i += 16) {
- // If the value is 0 when we mask out all the bits that could be set with
- // the current LSL value then it's representable.
- if ((Value & ~(0xffffULL << i)) == 0) {
- Shift = i / 16;
- UImm16 = (Value >> i) & 0xffff;
- return true;
- }
- }
- return false;
-}
-
-bool A64Imms::isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
- // MOVN is defined to set its register to NOT(LSL(imm16, shift)).
-
- // We have to be a little careful about a 32-bit register: 0xffff_1234 *is*
- // representable, but ~0xffff_1234 == 0xffff_ffff_0000_edcb which is not
- // a valid input for isMOVZImm.
- if (RegWidth == 32 && (Value & ~0xffffffffULL))
- return false;
-
- uint64_t MOVZEquivalent = RegWidth == 32 ? ~Value & 0xffffffff : ~Value;
-
- return isMOVZImm(RegWidth, MOVZEquivalent, UImm16, Shift);
-}
-
-bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
- int &UImm16, int &Shift) {
- if (isMOVZImm(RegWidth, Value, UImm16, Shift))
- return false;
-
- return isMOVNImm(RegWidth, Value, UImm16, Shift);
-}
-
-// decodeNeonModShiftImm - Decode a Neon OpCmode value into the
-// the shift amount and the shift type (shift zeros or ones in) and
-// returns whether the OpCmode value implies a shift operation.
-bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
- unsigned &ShiftOnesIn) {
- ShiftImm = 0;
- ShiftOnesIn = false;
- bool HasShift = true;
-
- if (OpCmode == 0xe) {
- // movi byte
- HasShift = false;
- } else if (OpCmode == 0x1e) {
- // movi 64-bit bytemask
- HasShift = false;
- } else if ((OpCmode & 0xc) == 0x8) {
- // shift zeros, per halfword
- ShiftImm = ((OpCmode & 0x2) >> 1);
- } else if ((OpCmode & 0x8) == 0) {
- // shift zeros, per word
- ShiftImm = ((OpCmode & 0x6) >> 1);
- } else if ((OpCmode & 0xe) == 0xc) {
- // shift ones, per word
- ShiftOnesIn = true;
- ShiftImm = (OpCmode & 0x1);
- } else {
- // per byte, per bytemask
- llvm_unreachable("Unsupported Neon modified immediate");
- }
-
- return HasShift;
-}
-
-// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values
-// into the element value and the element size in bits.
-uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode,
- unsigned &EltBits) {
- uint64_t DecodedVal = Val;
- EltBits = 0;
-
- if (OpCmode == 0xe) {
- // movi byte
- EltBits = 8;
- } else if (OpCmode == 0x1e) {
- // movi 64-bit bytemask
- DecodedVal = 0;
- for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
- if ((Val >> ByteNum) & 1)
- DecodedVal |= (uint64_t)0xff << (8 * ByteNum);
- }
- EltBits = 64;
- } else if ((OpCmode & 0xc) == 0x8) {
- // shift zeros, per halfword
- EltBits = 16;
- } else if ((OpCmode & 0x8) == 0) {
- // shift zeros, per word
- EltBits = 32;
- } else if ((OpCmode & 0xe) == 0xc) {
- // shift ones, per word
- EltBits = 32;
- } else {
- llvm_unreachable("Unsupported Neon modified immediate");
- }
- return DecodedVal;
-}
+AArch64TLBI::TLBIMapper::TLBIMapper()
+ : AArch64NamedImmMapper(TLBIPairs, 0) {}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 39b042b..9e4c389 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -1,4 +1,4 @@
-//===-- AArch64BaseInfo.h - Top level definitions for AArch64- --*- C++ -*-===//
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,95 +14,270 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64_BASEINFO_H
-#define LLVM_AARCH64_BASEINFO_H
+#ifndef AArch64BASEINFO_H
+#define AArch64BASEINFO_H
+// FIXME: Is it easiest to fix this layering violation by moving the .inc
+// #includes from AArch64MCTargetDesc.h to here?
+#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
namespace llvm {
-// // Enums corresponding to AArch64 condition codes
-namespace A64CC {
- // The CondCodes constants map directly to the 4-bit encoding of the
- // condition field for predicated instructions.
- enum CondCodes { // Meaning (integer) Meaning (floating-point)
- EQ = 0, // Equal Equal
- NE, // Not equal Not equal, or unordered
- HS, // Unsigned higher or same >, ==, or unordered
- LO, // Unsigned lower or same Less than
- MI, // Minus, negative Less than
- PL, // Plus, positive or zero >, ==, or unordered
- VS, // Overflow Unordered
- VC, // No overflow Ordered
- HI, // Unsigned higher Greater than, or unordered
- LS, // Unsigned lower or same Less than or equal
- GE, // Greater than or equal Greater than or equal
- LT, // Less than Less than, or unordered
- GT, // Signed greater than Greater than
- LE, // Signed less than or equal <, ==, or unordered
- AL, // Always (unconditional) Always (unconditional)
- NV, // Always (unconditional) Always (unconditional)
- // Note the NV exists purely to disassemble 0b1111. Execution
- // is "always".
- Invalid
- };
+inline static unsigned getWRegFromXReg(unsigned Reg) {
+ switch (Reg) {
+ case AArch64::X0: return AArch64::W0;
+ case AArch64::X1: return AArch64::W1;
+ case AArch64::X2: return AArch64::W2;
+ case AArch64::X3: return AArch64::W3;
+ case AArch64::X4: return AArch64::W4;
+ case AArch64::X5: return AArch64::W5;
+ case AArch64::X6: return AArch64::W6;
+ case AArch64::X7: return AArch64::W7;
+ case AArch64::X8: return AArch64::W8;
+ case AArch64::X9: return AArch64::W9;
+ case AArch64::X10: return AArch64::W10;
+ case AArch64::X11: return AArch64::W11;
+ case AArch64::X12: return AArch64::W12;
+ case AArch64::X13: return AArch64::W13;
+ case AArch64::X14: return AArch64::W14;
+ case AArch64::X15: return AArch64::W15;
+ case AArch64::X16: return AArch64::W16;
+ case AArch64::X17: return AArch64::W17;
+ case AArch64::X18: return AArch64::W18;
+ case AArch64::X19: return AArch64::W19;
+ case AArch64::X20: return AArch64::W20;
+ case AArch64::X21: return AArch64::W21;
+ case AArch64::X22: return AArch64::W22;
+ case AArch64::X23: return AArch64::W23;
+ case AArch64::X24: return AArch64::W24;
+ case AArch64::X25: return AArch64::W25;
+ case AArch64::X26: return AArch64::W26;
+ case AArch64::X27: return AArch64::W27;
+ case AArch64::X28: return AArch64::W28;
+ case AArch64::FP: return AArch64::W29;
+ case AArch64::LR: return AArch64::W30;
+ case AArch64::SP: return AArch64::WSP;
+ case AArch64::XZR: return AArch64::WZR;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
-} // namespace A64CC
+inline static unsigned getXRegFromWReg(unsigned Reg) {
+ switch (Reg) {
+ case AArch64::W0: return AArch64::X0;
+ case AArch64::W1: return AArch64::X1;
+ case AArch64::W2: return AArch64::X2;
+ case AArch64::W3: return AArch64::X3;
+ case AArch64::W4: return AArch64::X4;
+ case AArch64::W5: return AArch64::X5;
+ case AArch64::W6: return AArch64::X6;
+ case AArch64::W7: return AArch64::X7;
+ case AArch64::W8: return AArch64::X8;
+ case AArch64::W9: return AArch64::X9;
+ case AArch64::W10: return AArch64::X10;
+ case AArch64::W11: return AArch64::X11;
+ case AArch64::W12: return AArch64::X12;
+ case AArch64::W13: return AArch64::X13;
+ case AArch64::W14: return AArch64::X14;
+ case AArch64::W15: return AArch64::X15;
+ case AArch64::W16: return AArch64::X16;
+ case AArch64::W17: return AArch64::X17;
+ case AArch64::W18: return AArch64::X18;
+ case AArch64::W19: return AArch64::X19;
+ case AArch64::W20: return AArch64::X20;
+ case AArch64::W21: return AArch64::X21;
+ case AArch64::W22: return AArch64::X22;
+ case AArch64::W23: return AArch64::X23;
+ case AArch64::W24: return AArch64::X24;
+ case AArch64::W25: return AArch64::X25;
+ case AArch64::W26: return AArch64::X26;
+ case AArch64::W27: return AArch64::X27;
+ case AArch64::W28: return AArch64::X28;
+ case AArch64::W29: return AArch64::FP;
+ case AArch64::W30: return AArch64::LR;
+ case AArch64::WSP: return AArch64::SP;
+ case AArch64::WZR: return AArch64::XZR;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
-inline static const char *A64CondCodeToString(A64CC::CondCodes CC) {
- switch (CC) {
+static inline unsigned getBRegFromDReg(unsigned Reg) {
+ switch (Reg) {
+ case AArch64::D0: return AArch64::B0;
+ case AArch64::D1: return AArch64::B1;
+ case AArch64::D2: return AArch64::B2;
+ case AArch64::D3: return AArch64::B3;
+ case AArch64::D4: return AArch64::B4;
+ case AArch64::D5: return AArch64::B5;
+ case AArch64::D6: return AArch64::B6;
+ case AArch64::D7: return AArch64::B7;
+ case AArch64::D8: return AArch64::B8;
+ case AArch64::D9: return AArch64::B9;
+ case AArch64::D10: return AArch64::B10;
+ case AArch64::D11: return AArch64::B11;
+ case AArch64::D12: return AArch64::B12;
+ case AArch64::D13: return AArch64::B13;
+ case AArch64::D14: return AArch64::B14;
+ case AArch64::D15: return AArch64::B15;
+ case AArch64::D16: return AArch64::B16;
+ case AArch64::D17: return AArch64::B17;
+ case AArch64::D18: return AArch64::B18;
+ case AArch64::D19: return AArch64::B19;
+ case AArch64::D20: return AArch64::B20;
+ case AArch64::D21: return AArch64::B21;
+ case AArch64::D22: return AArch64::B22;
+ case AArch64::D23: return AArch64::B23;
+ case AArch64::D24: return AArch64::B24;
+ case AArch64::D25: return AArch64::B25;
+ case AArch64::D26: return AArch64::B26;
+ case AArch64::D27: return AArch64::B27;
+ case AArch64::D28: return AArch64::B28;
+ case AArch64::D29: return AArch64::B29;
+ case AArch64::D30: return AArch64::B30;
+ case AArch64::D31: return AArch64::B31;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
+
+
+static inline unsigned getDRegFromBReg(unsigned Reg) {
+ switch (Reg) {
+ case AArch64::B0: return AArch64::D0;
+ case AArch64::B1: return AArch64::D1;
+ case AArch64::B2: return AArch64::D2;
+ case AArch64::B3: return AArch64::D3;
+ case AArch64::B4: return AArch64::D4;
+ case AArch64::B5: return AArch64::D5;
+ case AArch64::B6: return AArch64::D6;
+ case AArch64::B7: return AArch64::D7;
+ case AArch64::B8: return AArch64::D8;
+ case AArch64::B9: return AArch64::D9;
+ case AArch64::B10: return AArch64::D10;
+ case AArch64::B11: return AArch64::D11;
+ case AArch64::B12: return AArch64::D12;
+ case AArch64::B13: return AArch64::D13;
+ case AArch64::B14: return AArch64::D14;
+ case AArch64::B15: return AArch64::D15;
+ case AArch64::B16: return AArch64::D16;
+ case AArch64::B17: return AArch64::D17;
+ case AArch64::B18: return AArch64::D18;
+ case AArch64::B19: return AArch64::D19;
+ case AArch64::B20: return AArch64::D20;
+ case AArch64::B21: return AArch64::D21;
+ case AArch64::B22: return AArch64::D22;
+ case AArch64::B23: return AArch64::D23;
+ case AArch64::B24: return AArch64::D24;
+ case AArch64::B25: return AArch64::D25;
+ case AArch64::B26: return AArch64::D26;
+ case AArch64::B27: return AArch64::D27;
+ case AArch64::B28: return AArch64::D28;
+ case AArch64::B29: return AArch64::D29;
+ case AArch64::B30: return AArch64::D30;
+ case AArch64::B31: return AArch64::D31;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
+
+namespace AArch64CC {
+
+// The CondCodes constants map directly to the 4-bit encoding of the condition
+// field for predicated instructions.
+enum CondCode { // Meaning (integer) Meaning (floating-point)
+ EQ = 0x0, // Equal Equal
+ NE = 0x1, // Not equal Not equal, or unordered
+ HS = 0x2, // Unsigned higher or same >, ==, or unordered
+ LO = 0x3, // Unsigned lower Less than
+ MI = 0x4, // Minus, negative Less than
+ PL = 0x5, // Plus, positive or zero >, ==, or unordered
+ VS = 0x6, // Overflow Unordered
+ VC = 0x7, // No overflow Not unordered
+ HI = 0x8, // Unsigned higher Greater than, or unordered
+ LS = 0x9, // Unsigned lower or same Less than or equal
+ GE = 0xa, // Greater than or equal Greater than or equal
+ LT = 0xb, // Less than Less than, or unordered
+ GT = 0xc, // Greater than Greater than
+ LE = 0xd, // Less than or equal <, ==, or unordered
+ AL = 0xe, // Always (unconditional) Always (unconditional)
+ NV = 0xf, // Always (unconditional) Always (unconditional)
+ // Note the NV exists purely to disassemble 0b1111. Execution is "always".
+ Invalid
+};
+
+inline static const char *getCondCodeName(CondCode Code) {
+ switch (Code) {
default: llvm_unreachable("Unknown condition code");
- case A64CC::EQ: return "eq";
- case A64CC::NE: return "ne";
- case A64CC::HS: return "hs";
- case A64CC::LO: return "lo";
- case A64CC::MI: return "mi";
- case A64CC::PL: return "pl";
- case A64CC::VS: return "vs";
- case A64CC::VC: return "vc";
- case A64CC::HI: return "hi";
- case A64CC::LS: return "ls";
- case A64CC::GE: return "ge";
- case A64CC::LT: return "lt";
- case A64CC::GT: return "gt";
- case A64CC::LE: return "le";
- case A64CC::AL: return "al";
- case A64CC::NV: return "nv";
+ case EQ: return "eq";
+ case NE: return "ne";
+ case HS: return "hs";
+ case LO: return "lo";
+ case MI: return "mi";
+ case PL: return "pl";
+ case VS: return "vs";
+ case VC: return "vc";
+ case HI: return "hi";
+ case LS: return "ls";
+ case GE: return "ge";
+ case LT: return "lt";
+ case GT: return "gt";
+ case LE: return "le";
+ case AL: return "al";
+ case NV: return "nv";
}
}
-inline static A64CC::CondCodes A64StringToCondCode(StringRef CondStr) {
- return StringSwitch<A64CC::CondCodes>(CondStr.lower())
- .Case("eq", A64CC::EQ)
- .Case("ne", A64CC::NE)
- .Case("ne", A64CC::NE)
- .Case("hs", A64CC::HS)
- .Case("cs", A64CC::HS)
- .Case("lo", A64CC::LO)
- .Case("cc", A64CC::LO)
- .Case("mi", A64CC::MI)
- .Case("pl", A64CC::PL)
- .Case("vs", A64CC::VS)
- .Case("vc", A64CC::VC)
- .Case("hi", A64CC::HI)
- .Case("ls", A64CC::LS)
- .Case("ge", A64CC::GE)
- .Case("lt", A64CC::LT)
- .Case("gt", A64CC::GT)
- .Case("le", A64CC::LE)
- .Case("al", A64CC::AL)
- .Case("nv", A64CC::NV)
- .Default(A64CC::Invalid);
+inline static CondCode getInvertedCondCode(CondCode Code) {
+ switch (Code) {
+ default: llvm_unreachable("Unknown condition code");
+ case EQ: return NE;
+ case NE: return EQ;
+ case HS: return LO;
+ case LO: return HS;
+ case MI: return PL;
+ case PL: return MI;
+ case VS: return VC;
+ case VC: return VS;
+ case HI: return LS;
+ case LS: return HI;
+ case GE: return LT;
+ case LT: return GE;
+ case GT: return LE;
+ case LE: return GT;
+ }
}
-inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) {
- // It turns out that the condition codes have been designed so that in order
- // to reverse the intent of the condition you only have to invert the low bit:
-
- return static_cast<A64CC::CondCodes>(static_cast<unsigned>(CC) ^ 0x1);
+/// Given a condition code, return NZCV flags that would satisfy that condition.
+/// The flag bits are in the format expected by the ccmp instructions.
+/// Note that many different flag settings can satisfy a given condition code,
+/// this function just returns one of them.
+inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
+ // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
+ enum { N = 8, Z = 4, C = 2, V = 1 };
+ switch (Code) {
+ default: llvm_unreachable("Unknown condition code");
+ case EQ: return Z; // Z == 1
+ case NE: return 0; // Z == 0
+ case HS: return C; // C == 1
+ case LO: return 0; // C == 0
+ case MI: return N; // N == 1
+ case PL: return 0; // N == 0
+ case VS: return V; // V == 1
+ case VC: return 0; // V == 0
+ case HI: return C; // C == 1 && Z == 0
+ case LS: return 0; // C == 0 || Z == 1
+ case GE: return 0; // N == V
+ case LT: return N; // N != V
+ case GT: return 0; // Z == 0 && N == V
+ case LE: return Z; // Z == 1 || N != V
+ }
}
+} // end namespace AArch64CC
/// Instances of this class can perform bidirectional mapping from random
/// identifier strings to operand encodings. For example "MSR" takes a named
@@ -115,14 +290,14 @@
/// out just how often these instructions are emitted before working on it. It
/// might even be optimal to just reorder the tables for the common instructions
/// rather than changing the algorithm.
-struct NamedImmMapper {
+struct AArch64NamedImmMapper {
struct Mapping {
const char *Name;
uint32_t Value;
};
template<int N>
- NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
+ AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
: Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {}
StringRef toString(uint32_t Value, bool &Valid) const;
@@ -138,7 +313,7 @@
uint32_t TooBigImm;
};
-namespace A64AT {
+namespace AArch64AT {
enum ATValues {
Invalid = -1, // Op0 Op1 CRn CRm Op2
S1E1R = 0x43c0, // 01 000 0111 1000 000
@@ -155,14 +330,14 @@
S12E0W = 0x63c7 // 01 100 0111 1000 111
};
- struct ATMapper : NamedImmMapper {
+ struct ATMapper : AArch64NamedImmMapper {
const static Mapping ATPairs[];
ATMapper();
};
}
-namespace A64DB {
+namespace AArch64DB {
enum DBValues {
Invalid = -1,
OSHLD = 0x1,
@@ -179,14 +354,14 @@
SY = 0xf
};
- struct DBarrierMapper : NamedImmMapper {
+ struct DBarrierMapper : AArch64NamedImmMapper {
const static Mapping DBarrierPairs[];
DBarrierMapper();
};
}
-namespace A64DC {
+namespace AArch64DC {
enum DCValues {
Invalid = -1, // Op1 CRn CRm Op2
ZVA = 0x5ba1, // 01 011 0111 0100 001
@@ -199,7 +374,7 @@
CISW = 0x43f2 // 01 000 0111 1110 010
};
- struct DCMapper : NamedImmMapper {
+ struct DCMapper : AArch64NamedImmMapper {
const static Mapping DCPairs[];
DCMapper();
@@ -207,7 +382,7 @@
}
-namespace A64IC {
+namespace AArch64IC {
enum ICValues {
Invalid = -1, // Op1 CRn CRm Op2
IALLUIS = 0x0388, // 000 0111 0001 000
@@ -216,7 +391,7 @@
};
- struct ICMapper : NamedImmMapper {
+ struct ICMapper : AArch64NamedImmMapper {
const static Mapping ICPairs[];
ICMapper();
@@ -227,19 +402,19 @@
}
}
-namespace A64ISB {
+namespace AArch64ISB {
enum ISBValues {
Invalid = -1,
SY = 0xf
};
- struct ISBMapper : NamedImmMapper {
+ struct ISBMapper : AArch64NamedImmMapper {
const static Mapping ISBPairs[];
ISBMapper();
};
}
-namespace A64PRFM {
+namespace AArch64PRFM {
enum PRFMValues {
Invalid = -1,
PLDL1KEEP = 0x00,
@@ -262,14 +437,14 @@
PSTL3STRM = 0x15
};
- struct PRFMMapper : NamedImmMapper {
+ struct PRFMMapper : AArch64NamedImmMapper {
const static Mapping PRFMPairs[];
PRFMMapper();
};
}
-namespace A64PState {
+namespace AArch64PState {
enum PStateValues {
Invalid = -1,
SPSel = 0x05,
@@ -277,7 +452,7 @@
DAIFClr = 0x1f
};
- struct PStateMapper : NamedImmMapper {
+ struct PStateMapper : AArch64NamedImmMapper {
const static Mapping PStatePairs[];
PStateMapper();
@@ -285,7 +460,7 @@
}
-namespace A64SE {
+namespace AArch64SE {
enum ShiftExtSpecifiers {
Invalid = -1,
LSL,
@@ -306,7 +481,7 @@
};
}
-namespace A64Layout {
+namespace AArch64Layout {
enum VectorLayout {
Invalid = -1,
VL_8B,
@@ -329,43 +504,43 @@
}
inline static const char *
-A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) {
switch (Layout) {
- case A64Layout::VL_8B: return ".8b";
- case A64Layout::VL_4H: return ".4h";
- case A64Layout::VL_2S: return ".2s";
- case A64Layout::VL_1D: return ".1d";
- case A64Layout::VL_16B: return ".16b";
- case A64Layout::VL_8H: return ".8h";
- case A64Layout::VL_4S: return ".4s";
- case A64Layout::VL_2D: return ".2d";
- case A64Layout::VL_B: return ".b";
- case A64Layout::VL_H: return ".h";
- case A64Layout::VL_S: return ".s";
- case A64Layout::VL_D: return ".d";
+ case AArch64Layout::VL_8B: return ".8b";
+ case AArch64Layout::VL_4H: return ".4h";
+ case AArch64Layout::VL_2S: return ".2s";
+ case AArch64Layout::VL_1D: return ".1d";
+ case AArch64Layout::VL_16B: return ".16b";
+ case AArch64Layout::VL_8H: return ".8h";
+ case AArch64Layout::VL_4S: return ".4s";
+ case AArch64Layout::VL_2D: return ".2d";
+ case AArch64Layout::VL_B: return ".b";
+ case AArch64Layout::VL_H: return ".h";
+ case AArch64Layout::VL_S: return ".s";
+ case AArch64Layout::VL_D: return ".d";
default: llvm_unreachable("Unknown Vector Layout");
}
}
-inline static A64Layout::VectorLayout
-A64StringToVectorLayout(StringRef LayoutStr) {
- return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
- .Case(".8b", A64Layout::VL_8B)
- .Case(".4h", A64Layout::VL_4H)
- .Case(".2s", A64Layout::VL_2S)
- .Case(".1d", A64Layout::VL_1D)
- .Case(".16b", A64Layout::VL_16B)
- .Case(".8h", A64Layout::VL_8H)
- .Case(".4s", A64Layout::VL_4S)
- .Case(".2d", A64Layout::VL_2D)
- .Case(".b", A64Layout::VL_B)
- .Case(".h", A64Layout::VL_H)
- .Case(".s", A64Layout::VL_S)
- .Case(".d", A64Layout::VL_D)
- .Default(A64Layout::Invalid);
+inline static AArch64Layout::VectorLayout
+AArch64StringToVectorLayout(StringRef LayoutStr) {
+ return StringSwitch<AArch64Layout::VectorLayout>(LayoutStr)
+ .Case(".8b", AArch64Layout::VL_8B)
+ .Case(".4h", AArch64Layout::VL_4H)
+ .Case(".2s", AArch64Layout::VL_2S)
+ .Case(".1d", AArch64Layout::VL_1D)
+ .Case(".16b", AArch64Layout::VL_16B)
+ .Case(".8h", AArch64Layout::VL_8H)
+ .Case(".4s", AArch64Layout::VL_4S)
+ .Case(".2d", AArch64Layout::VL_2D)
+ .Case(".b", AArch64Layout::VL_B)
+ .Case(".h", AArch64Layout::VL_H)
+ .Case(".s", AArch64Layout::VL_S)
+ .Case(".d", AArch64Layout::VL_D)
+ .Default(AArch64Layout::Invalid);
}
-namespace A64SysReg {
+namespace AArch64SysReg {
enum SysRegROValues {
MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000
DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000
@@ -396,16 +571,16 @@
ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011
ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100
ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101
- ID_AA64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000
- ID_AA64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001
- ID_AA64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000
- ID_AA64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001
- ID_AA64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100
- ID_AA64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101
- ID_AA64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000
- ID_AA64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001
- ID_AA64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000
- ID_AA64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001
+ ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000
+ ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001
+ ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000
+ ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001
+ ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100
+ ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101
+ ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000
+ ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001
+ ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000
+ ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001
MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000
MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001
MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010
@@ -960,38 +1135,45 @@
ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100
ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101
ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110
- ICH_LR15_EL2 = 0xe66f // 11 100 1100 1101 111
+ ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111
};
- // Note that these do not inherit from NamedImmMapper. This class is
+ // Cyclone specific system registers
+ enum CycloneSysRegValues {
+ CPM_IOACC_CTL_EL3 = 0xff90
+ };
+
+ // Note that these do not inherit from AArch64NamedImmMapper. This class is
// sufficiently different in its behaviour that I don't believe it's worth
- // burdening the common NamedImmMapper with abstractions only needed in
+ // burdening the common AArch64NamedImmMapper with abstractions only needed in
// this one case.
struct SysRegMapper {
- static const NamedImmMapper::Mapping SysRegPairs[];
+ static const AArch64NamedImmMapper::Mapping SysRegPairs[];
+ static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[];
- const NamedImmMapper::Mapping *InstPairs;
+ const AArch64NamedImmMapper::Mapping *InstPairs;
size_t NumInstPairs;
+ uint64_t FeatureBits;
- SysRegMapper() {}
+ SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }
uint32_t fromString(StringRef Name, bool &Valid) const;
std::string toString(uint32_t Bits, bool &Valid) const;
};
struct MSRMapper : SysRegMapper {
- static const NamedImmMapper::Mapping MSRPairs[];
- MSRMapper();
+ static const AArch64NamedImmMapper::Mapping MSRPairs[];
+ MSRMapper(uint64_t FeatureBits);
};
struct MRSMapper : SysRegMapper {
- static const NamedImmMapper::Mapping MRSPairs[];
- MRSMapper();
+ static const AArch64NamedImmMapper::Mapping MRSPairs[];
+ MRSMapper(uint64_t FeatureBits);
};
uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
}
-namespace A64TLBI {
+namespace AArch64TLBI {
enum TLBIValues {
Invalid = -1, // Op0 Op1 CRn CRm Op2
IPAS2E1IS = 0x6401, // 01 100 1000 0000 001
@@ -1028,7 +1210,7 @@
VAALE1 = 0x443f // 01 000 1000 0111 111
};
- struct TLBIMapper : NamedImmMapper {
+ struct TLBIMapper : AArch64NamedImmMapper {
const static Mapping TLBIPairs[];
TLBIMapper();
@@ -1051,88 +1233,62 @@
return true;
}
}
-}
+}
namespace AArch64II {
-
+ /// Target Operand Flag enum.
enum TOF {
- //===--------------------------------------------------------------===//
+ //===------------------------------------------------------------------===//
// AArch64 Specific MachineOperand flags.
MO_NO_FLAG,
- // MO_GOT - Represents a relocation referring to the GOT entry of a given
- // symbol. Used in adrp.
- MO_GOT,
+ MO_FRAGMENT = 0x7,
- // MO_GOT_LO12 - Represents a relocation referring to the low 12 bits of the
- // GOT entry of a given symbol. Used in ldr only.
- MO_GOT_LO12,
+ /// MO_PAGE - A symbol operand with this flag represents the pc-relative
+ /// offset of the 4K page containing the symbol. This is used with the
+ /// ADRP instruction.
+ MO_PAGE = 1,
- // MO_DTPREL_* - Represents a relocation referring to the offset from a
- // module's dynamic thread pointer. Used in the local-dynamic TLS access
- // model.
- MO_DTPREL_G1,
- MO_DTPREL_G0_NC,
+ /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
+ /// that symbol within a 4K page. This offset is added to the page address
+ /// to produce the complete address.
+ MO_PAGEOFF = 2,
- // MO_GOTTPREL_* - Represents a relocation referring to a GOT entry
- // providing the offset of a variable from the thread-pointer. Used in
- // initial-exec TLS model where this offset is assigned in the static thread
- // block and thus known by the dynamic linker.
- MO_GOTTPREL,
- MO_GOTTPREL_LO12,
+ /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+ /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G3 = 3,
- // MO_TLSDESC_* - Represents a relocation referring to a GOT entry providing
- // a TLS descriptor chosen by the dynamic linker. Used for the
- // general-dynamic and local-dynamic TLS access models where very littls is
- // known at link-time.
- MO_TLSDESC,
- MO_TLSDESC_LO12,
+ /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+ /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G2 = 4,
- // MO_TPREL_* - Represents a relocation referring to the offset of a
- // variable from the thread pointer itself. Used in the local-exec TLS
- // access model.
- MO_TPREL_G1,
- MO_TPREL_G0_NC,
+ /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+ /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G1 = 5,
- // MO_LO12 - On a symbol operand, this represents a relocation containing
- // lower 12 bits of the address. Used in add/sub/ldr/str.
- MO_LO12,
+ /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+ /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G0 = 6,
- // MO_ABS_G* - Represent the 16-bit granules of an absolute reference using
- // movz/movk instructions.
- MO_ABS_G3,
- MO_ABS_G2_NC,
- MO_ABS_G1_NC,
- MO_ABS_G0_NC
+ /// MO_GOT - This flag indicates that a symbol operand represents the
+ /// address of the GOT entry for the symbol, rather than the address of
+ /// the symbol itself.
+ MO_GOT = 8,
+
+ /// MO_NC - Indicates whether the linker is expected to check the symbol
+ /// reference for overflow. For example in an ADRP/ADD pair of relocations
+ /// the ADRP usually does check, but not the ADD.
+ MO_NC = 0x10,
+
+ /// MO_TLS - Indicates that the operand being accessed is some kind of
+ /// thread-local symbol. On Darwin, only one type of thread-local access
+ /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+ /// referee will affect interpretation.
+ MO_TLS = 0x20
};
-}
+} // end namespace AArch64II
-class APFloat;
-
-namespace A64Imms {
- bool isFPImm(const APFloat &Val, uint32_t &Imm8Bits);
-
- inline bool isFPImm(const APFloat &Val) {
- uint32_t Imm8;
- return isFPImm(Val, Imm8);
- }
-
- bool isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits);
- bool isLogicalImmBits(unsigned RegWidth, uint32_t Bits, uint64_t &Imm);
-
- bool isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
- bool isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
-
- // We sometimes want to know whether the immediate is representable with a
- // MOVN but *not* with a MOVZ (because that would take priority).
- bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
-
- uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits);
- bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
- unsigned &ShiftOnesIn);
- }
-
-} // end namespace llvm;
+} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/Utils/Android.mk b/lib/Target/AArch64/Utils/Android.mk
index b8bf795..3c1d194 100644
--- a/lib/Target/AArch64/Utils/Android.mk
+++ b/lib/Target/AArch64/Utils/Android.mk
@@ -1,5 +1,10 @@
LOCAL_PATH := $(call my-dir)
+arm64_utils_TBLGEN_TABLES := \
+ AArch64GenRegisterInfo.inc \
+ AArch64GenInstrInfo.inc \
+ AArch64GenSubtargetInfo.inc
+
arm64_utils_SRC_FILES := \
AArch64BaseInfo.cpp
@@ -16,7 +21,12 @@
LOCAL_MODULE_TAGS := optional
+TBLGEN_TD_DIR := $(LOCAL_PATH)/..
+TBLGEN_TABLES := $(arm64_utils_TBLGEN_TABLES)
+
include $(LLVM_DEVICE_BUILD_MK)
+include $(LLVM_TBLGEN_RULES_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
endif
@@ -32,5 +42,10 @@
LOCAL_MODULE_TAGS := optional
+TBLGEN_TD_DIR := $(LOCAL_PATH)/..
+TBLGEN_TABLES := $(arm64_utils_TBLGEN_TABLES)
+
include $(LLVM_HOST_BUILD_MK)
+include $(LLVM_TBLGEN_RULES_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt
index 4acecc9..bcefeb6 100644
--- a/lib/Target/AArch64/Utils/LLVMBuild.txt
+++ b/lib/Target/AArch64/Utils/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch646/Utils/LLVMBuild.txt ----------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt ----------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
index 0f4a645..0b80f82 100644
--- a/lib/Target/AArch64/Utils/Makefile
+++ b/lib/Target/AArch64/Utils/Makefile
@@ -9,7 +9,8 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMAArch64Utils
-# Hack: we need to include 'main' AArch64 target directory to grab private headers
-#CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+# Hack: we need to include 'main' AArch64 target directory to grab private
+# headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common