// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <xnnpack/allocator.h>
#include <array>
#include <cstddef>
#include <cstdint>
#include <initializer_list>
namespace xnnpack {
namespace aarch32 {
struct CoreRegister {
uint8_t code;
};
constexpr CoreRegister r0{0};
constexpr CoreRegister r1{1};
constexpr CoreRegister r2{2};
constexpr CoreRegister r3{3};
constexpr CoreRegister r4{4};
constexpr CoreRegister r5{5};
constexpr CoreRegister r6{6};
constexpr CoreRegister r7{7};
constexpr CoreRegister r8{8};
constexpr CoreRegister r9{9};
constexpr CoreRegister r10{10};
constexpr CoreRegister r11{11};
constexpr CoreRegister r12{12};
constexpr CoreRegister r13{13};
constexpr CoreRegister r14{14};
constexpr CoreRegister r15{15};
constexpr CoreRegister sp = r13;
constexpr CoreRegister lr = r14;
constexpr CoreRegister pc = r15;
static inline bool operator==(const CoreRegister lhs, const CoreRegister rhs) {
return lhs.code == rhs.code;
}
struct CoreRegisterList {
CoreRegisterList(std::initializer_list<CoreRegister> rs) {
for (auto r : rs) {
list |= 1 << r.code;
}
}
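// (list & (list - 1)) clears the lowest set bit, so the result is nonzero
// exactly when more than one register was added to the list.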
bool has_more_than_one_register() { return (list & (list - 1)) != 0; }
// Bit i is set if the core register with code i is in the list.
uint16_t list = 0;
};
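// For illustration: CoreRegisterList({r4, r5, lr}) sets bits 4, 5, and 14, so
// its list field equals 0x4030; push and pop below take such a list as their
// operand, e.g. push({r4, r5, lr}).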
static inline bool operator==(int i, CoreRegisterList registers) {
return i == registers.list;
}
struct SRegister {
uint8_t code;
uint8_t d() const { return code & 0x1; }
uint8_t vd() const { return (code & 0x1e) >> 1; }
};
static inline bool operator==(const SRegister lhs, const SRegister rhs) {
return lhs.code == rhs.code;
}
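// For example, following the bit split above: s5 has d() == 1 and vd() == 2,
// and the register number is reconstructed as Vd:D == (vd() << 1) | d() == 5.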
constexpr SRegister s0{0};
constexpr SRegister s1{1};
constexpr SRegister s2{2};
constexpr SRegister s3{3};
constexpr SRegister s4{4};
constexpr SRegister s5{5};
constexpr SRegister s6{6};
constexpr SRegister s7{7};
constexpr SRegister s8{8};
constexpr SRegister s9{9};
constexpr SRegister s10{10};
constexpr SRegister s11{11};
constexpr SRegister s12{12};
constexpr SRegister s13{13};
constexpr SRegister s14{14};
constexpr SRegister s15{15};
constexpr SRegister s16{16};
constexpr SRegister s17{17};
constexpr SRegister s18{18};
constexpr SRegister s19{19};
constexpr SRegister s20{20};
constexpr SRegister s21{21};
constexpr SRegister s22{22};
constexpr SRegister s23{23};
constexpr SRegister s24{24};
constexpr SRegister s25{25};
constexpr SRegister s26{26};
constexpr SRegister s27{27};
constexpr SRegister s28{28};
constexpr SRegister s29{29};
constexpr SRegister s30{30};
constexpr SRegister s31{31};
// DRegisterLane is defined before DRegister so that DRegister can overload
// operator[] for nicer syntax.
struct DRegisterLane {
uint8_t code;
uint8_t lane;
uint8_t d() const { return (code & 0x10) >> 4; }
uint8_t vd() const { return code & 0xf; }
};
static inline bool operator==(const DRegisterLane lhs, const DRegisterLane rhs) {
return lhs.code == rhs.code && lhs.lane == rhs.lane;
}
struct DRegister {
uint8_t code;
uint8_t d() const { return (code & 0x10) >> 4; }
uint8_t vd() const { return code & 0xf; }
const DRegisterLane operator[](std::size_t pos) const {
return DRegisterLane{code, static_cast<uint8_t>(pos)};
}
};
static inline bool operator==(const DRegister lhs, const DRegister rhs) {
return lhs.code == rhs.code;
}
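// For example: d4[1] yields DRegisterLane{4, 1}, i.e. lane 1 of d4, which is
// what the lane-indexed instructions below (e.g. VMLA.F32 <Qd>, <Qn>, <Dm[x]>)
// take as an operand.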
constexpr DRegister d0{0};
constexpr DRegister d1{1};
constexpr DRegister d2{2};
constexpr DRegister d3{3};
constexpr DRegister d4{4};
constexpr DRegister d5{5};
constexpr DRegister d6{6};
constexpr DRegister d7{7};
constexpr DRegister d8{8};
constexpr DRegister d9{9};
constexpr DRegister d10{10};
constexpr DRegister d11{11};
constexpr DRegister d12{12};
constexpr DRegister d13{13};
constexpr DRegister d14{14};
constexpr DRegister d15{15};
constexpr DRegister d16{16};
constexpr DRegister d17{17};
constexpr DRegister d18{18};
constexpr DRegister d19{19};
constexpr DRegister d20{20};
constexpr DRegister d21{21};
constexpr DRegister d22{22};
constexpr DRegister d23{23};
constexpr DRegister d24{24};
constexpr DRegister d25{25};
constexpr DRegister d26{26};
constexpr DRegister d27{27};
constexpr DRegister d28{28};
constexpr DRegister d29{29};
constexpr DRegister d30{30};
constexpr DRegister d31{31};
struct QRegister {
uint8_t code;
// A Q register is encoded via its first D register, i.e. as code * 2.
uint8_t d() const { return (code & 0x8) >> 3; }
uint8_t vd() const { return (code & 0x7) << 1; }
};
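// For example: q9 occupies d18/d19, so it encodes as d() == 1 and vd() == 2,
// and the D-register field is reconstructed as D:Vd == (d() << 4) | vd() == 18.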
constexpr QRegister q0{0};
constexpr QRegister q1{1};
constexpr QRegister q2{2};
constexpr QRegister q3{3};
constexpr QRegister q4{4};
constexpr QRegister q5{5};
constexpr QRegister q6{6};
constexpr QRegister q7{7};
constexpr QRegister q8{8};
constexpr QRegister q9{9};
constexpr QRegister q10{10};
constexpr QRegister q11{11};
constexpr QRegister q12{12};
constexpr QRegister q13{13};
constexpr QRegister q14{14};
constexpr QRegister q15{15};
// SIMD register lists are used more restrictively than core register lists:
// only consecutive registers can be used as an instruction operand.
template <typename RegType>
struct ConsecutiveRegisterList {
// End must be >= start.
ConsecutiveRegisterList(RegType s, RegType end)
: start(s),
length(end.code - s.code + 1) {}
ConsecutiveRegisterList(RegType start)
: ConsecutiveRegisterList(start, start) {}
RegType start;
int length;
};
using SRegisterList = ConsecutiveRegisterList<SRegister>;
using DRegisterList = ConsecutiveRegisterList<DRegister>;
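// For example: DRegisterList(d0, d3) describes the consecutive list {d0-d3}
// (start d0, length 4), and DRegisterList(d5) describes the single-register
// list {d5}. Brace syntax works at call sites too, e.g. vpush({d8, d15})
// pushes d8 through d15 (an illustrative sketch, not a required usage pattern).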
constexpr size_t max_label_users = 10;
// Label is a target of a branch. You call Assembler::bind to bind a label to an
// actual location in the instruction stream.
//
// ```
// Label l;
// b(kAL, l); // branching to an unbound label is fine; it will be patched later.
// a.bind(l); // binds the label to this location in the instruction stream.
// b(kAL, l); // branch to an already bound label.
// ```
struct Label {
// Location of label within Assembler buffer.
uint32_t* offset = nullptr;
// A label can only be bound once; binding it again leads to an error.
bool bound = (offset != nullptr);
// All users of this label, recorded by their offset in the Assembler buffer.
std::array<uint32_t*, max_label_users> users = {0};
size_t num_users = 0;
// Records a user (e.g. branch instruction) of this label.
// Returns true if success, false if number of users exceeds maximum.
bool add_use(uint32_t* offset) {
if (num_users >= max_label_users) {
return false;
}
users[num_users++] = offset;
return true;
}
};
// A8.5 Addressing modes for memory access.
enum class AddressingMode {
// [<Rn>, <offset>], offset applied to address in Rn.
kOffset,
// Pre-indexed not used, so not implemented.
// [<Rn>], <offset>, address from Rn, offset applied, written back to Rn.
kPostIndexed,
};
// Memory operands for memory access instructions. See "MemOperandHelper mem"
// below for syntax that is closer to native assembly.
class MemOperand {
public:
MemOperand(CoreRegister rn, int32_t offset)
: mode_(AddressingMode::kOffset),
rn_(rn),
offset_(offset) {}
MemOperand(CoreRegister rn, int32_t offset, AddressingMode mode)
: mode_(mode),
rn_(rn),
offset_(offset) {}
CoreRegister base() const { return rn_; }
int32_t offset() const { return offset_; }
AddressingMode mode() const { return mode_; }
// These are bits used for encoding, named based on the encoding description.
int32_t u() { return offset_ >= 0; }
int32_t p() { return mode_ != AddressingMode::kPostIndexed; }
// Note, kPostIndexed will write back, but doesn't need to set bit w.
int32_t w() { return 0; }
// Overload postfix increment to indicate a post-indexed addressing mode for load/stores.
MemOperand operator++(int) {
mode_ = AddressingMode::kPostIndexed;
return *this;
}
private:
AddressingMode mode_;
CoreRegister rn_;
int32_t offset_;
};
static inline bool operator==(const MemOperand lhs, const MemOperand rhs) {
return lhs.mode() == rhs.mode() && lhs.base() == rhs.base() && lhs.offset() == rhs.offset();
}
static inline MemOperand operator,(CoreRegister r, int32_t offset) {
return MemOperand(r, offset);
}
// Helper struct for some syntax sugar to look like native assembly, see mem.
struct MemOperandHelper {
const MemOperand operator[](MemOperand op) const { return op; }
MemOperand operator[](CoreRegister r) const { return MemOperand(r, 0); }
};
// Use "mem" (and its overload of array subscript operator) to get some syntax
// that looks closer to native assembly when accessing memory. For example:
// - ldr(r0, mem[rn, offset]); // offset
// - ldr(r0, mem[rn], offset); // post-indexed
constexpr MemOperandHelper mem;
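// The postfix increment overload on MemOperand marks the operand as
// post-indexed, which for the VLD1/VST1 forms below corresponds to the
// writeback ({!}) syntax. An illustrative sketch (not a required usage
// pattern):
// - vld1_32({d0, d1}, mem[r4]++); // VLD1.32 {d0, d1}, [r4]!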
// Condition codes for conditional execution of instructions.
enum Condition : uint32_t {
kEQ = 0x00000000,
kNE = 0x10000000,
kCS = 0x20000000,
kCC = 0x30000000,
kMI = 0x40000000,
kPL = 0x50000000,
kVS = 0x60000000,
kVC = 0x70000000,
kHI = 0x80000000,
kLS = 0x90000000,
kGE = 0xa0000000,
kLT = 0xB0000000,
kGT = 0xC0000000,
kLE = 0xD0000000,
kAL = 0xE0000000,
kHS = kCS,
kLO = kCC,
};
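// For illustration: the condition occupies bits 31:28 of an ARM instruction
// word; e.g. MOV r0, r1 with the always condition encodes as
// (kAL | 0x01A00001) == 0xE1A00001 (a worked example of the encoding only; the
// actual composition happens in the Assembler's private emitters below).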
enum class Error {
kNoError,
kOutOfMemory,
kInvalidOperand,
kLabelAlreadyBound,
kLabelOffsetOutOfBounds,
kLabelHasTooManyUsers,
kInvalidLaneIndex,
kInvalidRegisterListLength,
};
enum DataSize {
k8 = 0,
k16 = 1,
k32 = 2,
};
// A simple AArch32 assembler.
class Assembler {
public:
// Takes an xnn_code_buffer with a pointer to allocated memory.
explicit Assembler(xnn_code_buffer* buf);
Assembler& add(CoreRegister rd, CoreRegister rn, CoreRegister rm);
// Only uint8_t immediates are supported for now; this simplifies encoding.
Assembler& add(CoreRegister rd, CoreRegister rn, uint8_t imm);
Assembler& beq(Label& l) { return b(kEQ, l); }
Assembler& bne(Label& l) { return b(kNE, l); }
Assembler& bhi(Label& l) { return b(kHI, l); }
Assembler& bhs(Label& l) { return b(kHS, l); }
Assembler& blo(Label& l) { return b(kLO, l); }
Assembler& bx(CoreRegister rm);
// CMP supports a subset of uint32_t immediates, see "A5.2.4 Modified immediate
// constants in ARM instructions". For simplicity we start with uint8_t, which
// is fully representable using a "rotation" of 0.
Assembler& cmp(CoreRegister rn, uint8_t imm);
Assembler& ldr(CoreRegister rt, MemOperand operand, int32_t offset);
Assembler& ldr(CoreRegister rt, MemOperand operand);
Assembler& mov(CoreRegister rd, CoreRegister rm);
Assembler& movlo(CoreRegister rd, CoreRegister rm);
Assembler& movls(CoreRegister rd, CoreRegister rm);
Assembler& pld(MemOperand operand);
Assembler& pop(CoreRegisterList regs);
Assembler& push(CoreRegisterList regs);
Assembler& sub(CoreRegister rd, CoreRegister rn, CoreRegister rm);
// Only uint8_t immediates are supported for now; this simplifies encoding.
Assembler& subs(CoreRegister rd, CoreRegister rn, uint8_t imm);
Assembler& tst(CoreRegister rn, uint8_t imm);
// SIMD instructions.
Assembler& vdup_8(QRegister qd, DRegisterLane dm) { return vdup(k8, qd, dm); }
Assembler& vdup_16(QRegister qd, DRegisterLane dm) { return vdup(k16, qd, dm); }
Assembler& vdup_32(QRegister qd, DRegisterLane dm) { return vdup(k32, qd, dm); }
Assembler& vext_8(QRegister qd, QRegister qn, QRegister qm, uint8_t imm4);
// VLD1.8 <list>, [<Rn>]{!} (multiple single elements).
Assembler& vld1_8(DRegisterList regs, MemOperand op);
// VLD1.32 <list>, [<Rn>]{!} (multiple single elements).
Assembler& vld1_32(DRegisterList regs, MemOperand op);
// VLD1.32 <list>, [<Rn>]{!} (single element to all lanes).
// The two register-list forms cannot be distinguished in C++ syntax, so this
// uses an instruction name similar to AArch64's LD1R.
Assembler& vld1r_32(DRegisterList regs, MemOperand op);
// VLDM <Rn>{!}, <list>. {!} is indicated by setting `wb` argument.
Assembler& vldm(CoreRegister rn, SRegisterList regs, bool wb);
Assembler& vldm(CoreRegister rn, DRegisterList regs, bool wb);
Assembler& vldm(CoreRegister rn, SRegisterList regs) { return vldm(rn, regs, false); }
Assembler& vldm(CoreRegister rn, DRegisterList regs) { return vldm(rn, regs, false); }
Assembler& vldr(DRegister dd, MemOperand op);
Assembler& vmax_f32(QRegister qd, QRegister qn, QRegister qm);
Assembler& vmax_s8(QRegister qd, QRegister qn, QRegister qm);
Assembler& vmin_f32(QRegister qd, QRegister qn, QRegister qm);
Assembler& vmin_s8(QRegister qd, QRegister qn, QRegister qm);
// VMLA.F32 <Qd>, <Qn>, <Dm[x]>
Assembler& vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm);
// VMLAL.S16 <Qd>, <Dn>, <Dm[x]>
Assembler& vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm);
// VMOV.F32 <Sd>, <Sm>; encoding A2.
Assembler& vmov(SRegister sd, SRegister sm);
// VMOV <Dm>, <Rt>, <Rt2>; encoding A1.
Assembler& vmov(DRegister dm, CoreRegister rt, CoreRegister rt2);
// VMOV <Dd>, <Dm>; encoding A1.
Assembler& vmov(DRegister dd, DRegister dm);
// VMOV <Qd>, <Qm>; encoding A1.
Assembler& vmov(QRegister qd, QRegister qm);
// VMOVL.S8 <Qd>, <Dm>
Assembler& vmovl_s8(QRegister qd, DRegister dm);
Assembler& vpop(DRegisterList regs);
Assembler& vpush(SRegisterList regs);
Assembler& vpush(DRegisterList regs);
Assembler& vqadd_s16(QRegister qd, QRegister qn, QRegister qm);
Assembler& vqdmulh_s32(QRegister qd, QRegister qn, DRegisterLane dm);
Assembler& vqmovn_s32(DRegister dd, QRegister qm);
Assembler& vqshl_s32(QRegister qd, QRegister qm, QRegister qn);
Assembler& vrshl_s32(QRegister qd, QRegister qm, QRegister qn);
// VST1.8 <list>, [<Rn>]{!} (multiple single elements).
Assembler& vst1_8(DRegisterList regs, MemOperand op) { return vst1(k8, regs, op); }
// VST1.8 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
Assembler& vst1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k8, regs, op, rm); }
// VST1.8 <list>, [<Rn>]{!} (single element from one lane).
Assembler& vst1_8(DRegisterLane dd, MemOperand op) { return vst1(k8, dd, op); }
// VST1.16 <list>, [<Rn>]{!} (multiple single elements).
Assembler& vst1_16(DRegisterList regs, MemOperand op) { return vst1(k16, regs, op); }
// VST1.16 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
Assembler& vst1_16(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k16, regs, op, rm); }
// VST1.16 <list>, [<Rn>]{!} (single element from one lane).
Assembler& vst1_16(DRegisterLane dd, MemOperand op) { return vst1(k16, dd, op); }
// VST1.32 <list>, [<Rn>]{!} (multiple single elements).
Assembler& vst1_32(DRegisterList regs, MemOperand op) { return vst1(k32, regs, op); }
// VST1.32 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
Assembler& vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k32, regs, op, rm); }
// VST1.32 <list>, [<Rn>]{!} (single element from one lane).
Assembler& vst1_32(DRegisterLane dd, MemOperand op) { return vst1(k32, dd, op); }
// Binds Label l to the current location in the code buffer.
Assembler& bind(Label& l);
// Finishes assembly of code; this should be the last function called on an
// instance of Assembler. Returns a pointer to the start of the code region.
void* finalize();
// Reset the assembler state (no memory is freed).
void reset();
// Get a pointer to the start of code buffer.
const uint32_t* start() const { return buffer_; }
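// Get a pointer to the current position in the code buffer (the next
// instruction will be emitted here).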
const uint32_t* offset() const { return cursor_; }
// Returns the number of bytes of code actually in the buffer.
size_t code_size_in_bytes() const { return (cursor_ - buffer_) * 4; }
Error error() const { return error_; }
private:
// Emits a 32-bit value to the code buffer.
Assembler& emit32(uint32_t value);
Assembler& mov(Condition c, CoreRegister rd, CoreRegister rm);
Assembler& b(Condition c, Label& l);
Assembler& vdup(DataSize size, QRegister qd, DRegisterLane dm);
Assembler& vst1(DataSize size, DRegisterList regs, MemOperand op);
Assembler& vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm);
Assembler& vst1(DataSize size, DRegisterLane dd, MemOperand op);
// Pointer to start of code buffer.
uint32_t* buffer_;
// Pointer to current place in code buffer.
uint32_t* cursor_;
// Pointer to out-of-bounds of code buffer.
uint32_t* top_;
// Errors encountered while assembling code.
Error error_ = Error::kNoError;
// Holds an xnn_code_buffer; code is written to its code pointer, and unused
// pages are unmapped on finalize.
xnn_code_buffer* xnn_buffer = nullptr;
};
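// A minimal end-to-end sketch of using the Assembler (assuming
// xnn_allocate_code_memory from xnnpack/allocator.h and a sufficiently large
// buffer; illustrative only, not a prescribed usage pattern):
// ```
// xnn_code_buffer buf;
// xnn_allocate_code_memory(&buf, 4096);
// Assembler a(&buf);
// a.add(r0, r0, r1); // r0 += r1
// a.bx(lr);          // return to caller
// void* fn = a.finalize();
// // fn points to the encoded instructions; a.error() reports any failure.
// ```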
} // namespace aarch32
} // namespace xnnpack