native f32 min/max

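Add dedicated min_f32/max_f32 ops in place of the select()-based min/max
helpers removed from SkVM.h. Both assemblers gain the matching
instructions (vminps/vmaxps and fmin4s/fmax4s), with encoding tests
added for each pair.

No diffs.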

Change-Id: Ia0b35c2787e27d74763f21b81072affa6caf1e5a
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/253720
Commit-Queue: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Reed <reed@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Mike Reed <reed@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index a51b4f0..62b395a 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -138,6 +138,8 @@
                 case Op::sub_f32: write(o, V{id}, "= sub_f32", V{x}, V{y}      ); break;
                 case Op::mul_f32: write(o, V{id}, "= mul_f32", V{x}, V{y}      ); break;
                 case Op::div_f32: write(o, V{id}, "= div_f32", V{x}, V{y}      ); break;
+                case Op::min_f32: write(o, V{id}, "= min_f32", V{x}, V{y}      ); break;
+                case Op::max_f32: write(o, V{id}, "= max_f32", V{x}, V{y}      ); break;
                 case Op::mad_f32: write(o, V{id}, "= mad_f32", V{x}, V{y}, V{z}); break;
 
                 case Op:: eq_f32: write(o, V{id}, "= eq_f32", V{x}, V{y}); break;
@@ -243,6 +245,8 @@
                 case Op::sub_f32: write(o, R{d}, "= sub_f32", R{x}, R{y}      ); break;
                 case Op::mul_f32: write(o, R{d}, "= mul_f32", R{x}, R{y}      ); break;
                 case Op::div_f32: write(o, R{d}, "= div_f32", R{x}, R{y}      ); break;
+                case Op::min_f32: write(o, R{d}, "= min_f32", R{x}, R{y}      ); break;
+                case Op::max_f32: write(o, R{d}, "= max_f32", R{x}, R{y}      ); break;
                 case Op::mad_f32: write(o, R{d}, "= mad_f32", R{x}, R{y}, R{z}); break;
 
                 case Op:: eq_f32: write(o, R{d}, "= eq_f32", R{x}, R{y}); break;
@@ -497,6 +501,8 @@
     F32 Builder::sub(F32 x, F32 y       ) { return {this->push(Op::sub_f32, x.id, y.id)}; }
     F32 Builder::mul(F32 x, F32 y       ) { return {this->push(Op::mul_f32, x.id, y.id)}; }
     F32 Builder::div(F32 x, F32 y       ) { return {this->push(Op::div_f32, x.id, y.id)}; }
+    F32 Builder::min(F32 x, F32 y       ) { return {this->push(Op::min_f32, x.id, y.id)}; }
+    F32 Builder::max(F32 x, F32 y       ) { return {this->push(Op::max_f32, x.id, y.id)}; }
     F32 Builder::mad(F32 x, F32 y, F32 z) {
         if (this->isZero(z.id)) {
             return this->mul(x,y);
@@ -755,6 +761,8 @@
     void Assembler::vsubps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5c, dst,x,y); }
     void Assembler::vmulps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x59, dst,x,y); }
     void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5e, dst,x,y); }
+    void Assembler::vminps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5d, dst,x,y); }
+    void Assembler::vmaxps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5f, dst,x,y); }
 
     void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
     void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
@@ -1063,6 +1071,8 @@
     void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
     void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
     void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
+    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
+    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
 
     void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
     void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
@@ -1364,6 +1374,8 @@
                     CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
                     CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
                     CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
+                    CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
+                    CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
 
                     CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
 
@@ -1872,6 +1884,8 @@
                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
                 case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break;
                 case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;
+                case Op::min_f32: a->vminps(dst(), r[x], r[y]); break;
+                case Op::max_f32: a->vmaxps(dst(), r[x], r[y]); break;
 
                 case Op::mad_f32:
                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
@@ -1956,6 +1970,8 @@
                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
                 case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
                 case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
+                case Op::min_f32: a->fmin4s(dst(), r[x], r[y]); break;
+                case Op::max_f32: a->fmax4s(dst(), r[x], r[y]); break;
 
                 case Op::mad_f32: // fmla4s is z += x*y
                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z],  r[x],  r[y]);   }
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 24f95ca..ddda739 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -69,7 +69,7 @@
         DstEqXOpY vpand, vpor, vpxor, vpandn,
                   vpaddd, vpsubd, vpmulld,
                           vpsubw, vpmullw,
-                  vaddps, vsubps, vmulps, vdivps,
+                  vaddps, vsubps, vmulps, vdivps, vminps, vmaxps,
                   vfmadd132ps, vfmadd213ps, vfmadd231ps,
                   vpackusdw, vpackuswb,
                   vpcmpeqd, vpcmpgtd;
@@ -144,7 +144,7 @@
                add4s,  sub4s,  mul4s,
               cmeq4s, cmgt4s,
                        sub8h,  mul8h,
-              fadd4s, fsub4s, fmul4s, fdiv4s,
+              fadd4s, fsub4s, fmul4s, fdiv4s, fmin4s, fmax4s,
               fcmeq4s, fcmgt4s, fcmge4s,
               tbl;
 
@@ -262,6 +262,8 @@
         sub_f32, sub_i32, sub_i16x2,
         mul_f32, mul_i32, mul_i16x2,
         div_f32,
+        min_f32,
+        max_f32,
         mad_f32,
                  shl_i32, shl_i16x2,
                  shr_i32, shr_i16x2,
@@ -368,6 +370,8 @@
         F32 sub(F32 x, F32 y);
         F32 mul(F32 x, F32 y);
         F32 div(F32 x, F32 y);
+        F32 min(F32 x, F32 y);
+        F32 max(F32 x, F32 y);
         F32 mad(F32 x, F32 y, F32 z);  //  x*y+z, often an FMA
 
         I32 eq (F32 x, F32 y);
@@ -457,10 +461,6 @@
 
         uint32_t hash() const;
 
-        // TODO: native min/max ops
-        skvm::F32 min(skvm::F32 x, skvm::F32 y) { return select(lt(x,y), x,y); }
-        skvm::F32 max(skvm::F32 x, skvm::F32 y) { return select(gt(x,y), x,y); }
-
     private:
         struct InstructionHash {
             size_t operator()(const Instruction& inst) const;
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 86a2ae5..f298021 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -829,6 +829,14 @@
     });
 
     test_asm(r, [&](A& a) {
+        a.vminps(A::ymm0, A::ymm1, A::ymm2);
+        a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
+    },{
+        0xc5,0xf4,0x5d,0xc2,
+        0xc5,0xf4,0x5f,0xc2,
+    });
+
+    test_asm(r, [&](A& a) {
         a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
     },{
         0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
@@ -1063,6 +1071,8 @@
         a.fsub4s(A::v4, A::v3, A::v1);
         a.fmul4s(A::v4, A::v3, A::v1);
         a.fdiv4s(A::v4, A::v3, A::v1);
+        a.fmin4s(A::v4, A::v3, A::v1);
+        a.fmax4s(A::v4, A::v3, A::v1);
 
         a.fmla4s(A::v4, A::v3, A::v1);
 
@@ -1091,6 +1101,8 @@
         0x64,0xd4,0xa1,0x4e,
         0x64,0xdc,0x21,0x6e,
         0x64,0xfc,0x21,0x6e,
+        0x64,0xf4,0xa1,0x4e,
+        0x64,0xf4,0x21,0x4e,
 
         0x64,0xcc,0x21,0x4e,