Rename vunop and vbinop functions
- Normalize names to be consistent with other microkernel-related functions and files. Only operator-related functions have "op" in the name.
- vunop -> vunary
- vbinop -> vbinary
PiperOrigin-RevId: 281864878
diff --git a/BUILD.bazel b/BUILD.bazel
index 7cdc8ef..2a6803a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -74,57 +74,32 @@
]
SCALAR_UKERNELS = [
- "src/f32-argmaxpool/9p8x-scalar-c1.c",
"src/f32-argmaxpool/4x-scalar-c1.c",
+ "src/f32-argmaxpool/9p8x-scalar-c1.c",
"src/f32-argmaxpool/9x-scalar-c1.c",
"src/f32-avgpool/mp9p8q-scalar.c",
"src/f32-avgpool/up9-scalar.c",
"src/f32-bilinear/scalar-c1.c",
"src/f32-bilinear/scalar-c2.c",
"src/f32-bilinear/scalar-c4.c",
- "src/f32-binop/vadd-scalar-x1.c",
- "src/f32-binop/vadd-scalar-x2.c",
- "src/f32-binop/vadd-scalar-x4.c",
- "src/f32-binop/vaddc-scalar-x1.c",
- "src/f32-binop/vaddc-scalar-x2.c",
- "src/f32-binop/vaddc-scalar-x4.c",
- "src/f32-binop/vmul-scalar-x1.c",
- "src/f32-binop/vmul-scalar-x2.c",
- "src/f32-binop/vmul-scalar-x4.c",
- "src/f32-binop/vmulc-scalar-x1.c",
- "src/f32-binop/vmulc-scalar-x2.c",
- "src/f32-binop/vmulc-scalar-x4.c",
- "src/f32-binop/vsub-scalar-x1.c",
- "src/f32-binop/vsub-scalar-x2.c",
- "src/f32-binop/vsub-scalar-x4.c",
- "src/f32-binop/vsubc-scalar-x1.c",
- "src/f32-binop/vsubc-scalar-x2.c",
- "src/f32-binop/vsubc-scalar-x4.c",
- "src/f32-binop/vrsubc-scalar-x1.c",
- "src/f32-binop/vrsubc-scalar-x2.c",
- "src/f32-binop/vrsubc-scalar-x4.c",
"src/f32-clamp/scalar.c",
"src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c",
- "src/f32-igemm/1x4-scalar.c",
- "src/f32-igemm/2x4-scalar.c",
- "src/f32-igemm/4x2-scalar.c",
- "src/f32-igemm/4x4-scalar.c",
- "src/f32-dwconv/up1x25-scalar.c",
- "src/f32-dwconv/up1x25-scalar-acc2.c",
- "src/f32-dwconv/up2x25-scalar.c",
- "src/f32-dwconv/up2x25-scalar-acc2.c",
- "src/f32-dwconv/up1x4-scalar.c",
- "src/f32-dwconv/up1x4-scalar-acc2.c",
- "src/f32-dwconv/up2x4-scalar.c",
- "src/f32-dwconv/up2x4-scalar-acc2.c",
- "src/f32-dwconv/up1x9-scalar.c",
- "src/f32-dwconv/up1x9-scalar-acc2.c",
- "src/f32-dwconv/up2x9-scalar.c",
- "src/f32-dwconv/up2x9-scalar-acc2.c",
"src/f32-dwconv-spchw/3x3p1-scalar.c",
- "src/f32-dwconv-spchw/5x5p2-scalar.c",
"src/f32-dwconv-spchw/3x3s2p1-scalar.c",
+ "src/f32-dwconv-spchw/5x5p2-scalar.c",
"src/f32-dwconv-spchw/5x5s2p2-scalar.c",
+ "src/f32-dwconv/up1x25-scalar-acc2.c",
+ "src/f32-dwconv/up1x25-scalar.c",
+ "src/f32-dwconv/up1x4-scalar-acc2.c",
+ "src/f32-dwconv/up1x4-scalar.c",
+ "src/f32-dwconv/up1x9-scalar-acc2.c",
+ "src/f32-dwconv/up1x9-scalar.c",
+ "src/f32-dwconv/up2x25-scalar-acc2.c",
+ "src/f32-dwconv/up2x25-scalar.c",
+ "src/f32-dwconv/up2x4-scalar-acc2.c",
+ "src/f32-dwconv/up2x4-scalar.c",
+ "src/f32-dwconv/up2x9-scalar-acc2.c",
+ "src/f32-dwconv/up2x9-scalar.c",
"src/f32-gavgpool-spchw/scalar-x1.c",
"src/f32-gavgpool/mp7p7q-scalar.c",
"src/f32-gavgpool/up7-scalar.c",
@@ -136,6 +111,10 @@
"src/f32-gemminc/2x4-scalar.c",
"src/f32-gemminc/4x4-scalar.c",
"src/f32-hswish/scalar.c",
+ "src/f32-igemm/1x4-scalar.c",
+ "src/f32-igemm/2x4-scalar.c",
+ "src/f32-igemm/4x2-scalar.c",
+ "src/f32-igemm/4x4-scalar.c",
"src/f32-maxpool/9p8x-scalar-c1.c",
"src/f32-pavgpool/mp9p8q-scalar.c",
"src/f32-pavgpool/up9-scalar.c",
@@ -156,16 +135,37 @@
"src/f32-spmm/8x1-scalar.c",
"src/f32-spmm/8x2-scalar.c",
"src/f32-spmm/8x4-scalar.c",
+ "src/f32-vbinary/vadd-scalar-x1.c",
+ "src/f32-vbinary/vadd-scalar-x2.c",
+ "src/f32-vbinary/vadd-scalar-x4.c",
+ "src/f32-vbinary/vaddc-scalar-x1.c",
+ "src/f32-vbinary/vaddc-scalar-x2.c",
+ "src/f32-vbinary/vaddc-scalar-x4.c",
+ "src/f32-vbinary/vmul-scalar-x1.c",
+ "src/f32-vbinary/vmul-scalar-x2.c",
+ "src/f32-vbinary/vmul-scalar-x4.c",
+ "src/f32-vbinary/vmulc-scalar-x1.c",
+ "src/f32-vbinary/vmulc-scalar-x2.c",
+ "src/f32-vbinary/vmulc-scalar-x4.c",
+ "src/f32-vbinary/vrsubc-scalar-x1.c",
+ "src/f32-vbinary/vrsubc-scalar-x2.c",
+ "src/f32-vbinary/vrsubc-scalar-x4.c",
+ "src/f32-vbinary/vsub-scalar-x1.c",
+ "src/f32-vbinary/vsub-scalar-x2.c",
+ "src/f32-vbinary/vsub-scalar-x4.c",
+ "src/f32-vbinary/vsubc-scalar-x1.c",
+ "src/f32-vbinary/vsubc-scalar-x2.c",
+ "src/f32-vbinary/vsubc-scalar-x4.c",
"src/f32-vmulcaddc/c1-scalar-2x.c",
"src/f32-vmulcaddc/c2-scalar-2x.c",
"src/f32-vmulcaddc/c4-scalar-2x.c",
"src/q8-avgpool/mp9p8q-scalar.c",
"src/q8-avgpool/up9-scalar.c",
- "src/q8-igemm/2x2-scalar.c",
"src/q8-dwconv/up1x9-scalar.c",
"src/q8-gavgpool/mp7p7q-scalar.c",
"src/q8-gavgpool/up7-scalar.c",
"src/q8-gemm/2x2-scalar.c",
+ "src/q8-igemm/2x2-scalar.c",
"src/q8-vadd/scalar.c",
"src/u8-clamp/scalar.c",
"src/u8-lut32norm/scalar.c",
@@ -188,50 +188,26 @@
]
PSIMD_UKERNELS = [
- "src/f32-argmaxpool/9p8x-psimd-c4.c",
"src/f32-argmaxpool/4x-psimd-c4.c",
+ "src/f32-argmaxpool/9p8x-psimd-c4.c",
"src/f32-argmaxpool/9x-psimd-c4.c",
"src/f32-avgpool/mp9p8q-psimd.c",
"src/f32-avgpool/up9-psimd.c",
"src/f32-bilinear/psimd-c4.c",
"src/f32-bilinear/psimd-c8.c",
- "src/f32-binop/vadd-psimd-x4.c",
- "src/f32-binop/vadd-psimd-x8.c",
- "src/f32-binop/vaddc-psimd-x4.c",
- "src/f32-binop/vaddc-psimd-x8.c",
- "src/f32-binop/vmul-psimd-x4.c",
- "src/f32-binop/vmul-psimd-x8.c",
- "src/f32-binop/vmulc-psimd-x4.c",
- "src/f32-binop/vmulc-psimd-x8.c",
- "src/f32-binop/vrsubc-psimd-x4.c",
- "src/f32-binop/vrsubc-psimd-x8.c",
- "src/f32-binop/vsub-psimd-x4.c",
- "src/f32-binop/vsub-psimd-x8.c",
- "src/f32-binop/vsubc-psimd-x4.c",
- "src/f32-binop/vsubc-psimd-x8.c",
"src/f32-clamp/psimd.c",
- "src/f32-igemm/1x8-psimd-loadsplat.c",
- "src/f32-igemm/1x8-psimd-splat.c",
- "src/f32-igemm/1x8s4-psimd.c",
- "src/f32-igemm/4x2c4-psimd.c",
- "src/f32-igemm/4x8-psimd-loadsplat.c",
- "src/f32-igemm/4x8-psimd-splat.c",
- "src/f32-igemm/4x8s4-psimd.c",
- "src/f32-igemm/6x8-psimd-loadsplat.c",
- "src/f32-igemm/6x8-psimd-splat.c",
- "src/f32-igemm/6x8s4-psimd.c",
- "src/f32-dwconv/up4x25-psimd.c",
"src/f32-dwconv/up4x25-psimd-acc2.c",
- "src/f32-dwconv/up8x25-psimd.c",
- "src/f32-dwconv/up8x25-psimd-acc2.c",
- "src/f32-dwconv/up4x4-psimd.c",
+ "src/f32-dwconv/up4x25-psimd.c",
"src/f32-dwconv/up4x4-psimd-acc2.c",
- "src/f32-dwconv/up8x4-psimd.c",
- "src/f32-dwconv/up8x4-psimd-acc2.c",
- "src/f32-dwconv/up4x9-psimd.c",
+ "src/f32-dwconv/up4x4-psimd.c",
"src/f32-dwconv/up4x9-psimd-acc2.c",
- "src/f32-dwconv/up8x9-psimd.c",
+ "src/f32-dwconv/up4x9-psimd.c",
+ "src/f32-dwconv/up8x25-psimd-acc2.c",
+ "src/f32-dwconv/up8x25-psimd.c",
+ "src/f32-dwconv/up8x4-psimd-acc2.c",
+ "src/f32-dwconv/up8x4-psimd.c",
"src/f32-dwconv/up8x9-psimd-acc2.c",
+ "src/f32-dwconv/up8x9-psimd.c",
"src/f32-gavgpool/mp7p7q-psimd.c",
"src/f32-gavgpool/up7-psimd.c",
"src/f32-gemm/1x8-psimd-loadsplat.c",
@@ -253,12 +229,36 @@
"src/f32-gemminc/6x8-psimd-splat.c",
"src/f32-gemminc/6x8s4-psimd.c",
"src/f32-hswish/psimd.c",
+ "src/f32-igemm/1x8-psimd-loadsplat.c",
+ "src/f32-igemm/1x8-psimd-splat.c",
+ "src/f32-igemm/1x8s4-psimd.c",
+ "src/f32-igemm/4x2c4-psimd.c",
+ "src/f32-igemm/4x8-psimd-loadsplat.c",
+ "src/f32-igemm/4x8-psimd-splat.c",
+ "src/f32-igemm/4x8s4-psimd.c",
+ "src/f32-igemm/6x8-psimd-loadsplat.c",
+ "src/f32-igemm/6x8-psimd-splat.c",
+ "src/f32-igemm/6x8s4-psimd.c",
"src/f32-maxpool/9p8x-psimd-c4.c",
"src/f32-pavgpool/mp9p8q-psimd.c",
"src/f32-pavgpool/up9-psimd.c",
"src/f32-ppmm/4x8-psimd.c",
"src/f32-prelu/psimd-2x4.c",
"src/f32-prelu/psimd-2x8.c",
+ "src/f32-vbinary/vadd-psimd-x4.c",
+ "src/f32-vbinary/vadd-psimd-x8.c",
+ "src/f32-vbinary/vaddc-psimd-x4.c",
+ "src/f32-vbinary/vaddc-psimd-x8.c",
+ "src/f32-vbinary/vmul-psimd-x4.c",
+ "src/f32-vbinary/vmul-psimd-x8.c",
+ "src/f32-vbinary/vmulc-psimd-x4.c",
+ "src/f32-vbinary/vmulc-psimd-x8.c",
+ "src/f32-vbinary/vrsubc-psimd-x4.c",
+ "src/f32-vbinary/vrsubc-psimd-x8.c",
+ "src/f32-vbinary/vsub-psimd-x4.c",
+ "src/f32-vbinary/vsub-psimd-x8.c",
+ "src/f32-vbinary/vsubc-psimd-x4.c",
+ "src/f32-vbinary/vsubc-psimd-x8.c",
"src/f32-vmulcaddc/c4-psimd-2x.c",
"src/f32-vmulcaddc/c8-psimd-2x.c",
"src/x32-packx/x4-psimd.c",
@@ -276,56 +276,42 @@
"src/f32-avgpool/up9-neon.c",
"src/f32-bilinear/neon-c4.c",
"src/f32-bilinear/neon-c8.c",
- "src/f32-binop/vadd-neon-x4.c",
- "src/f32-binop/vadd-neon-x8.c",
- "src/f32-binop/vaddc-neon-x4.c",
- "src/f32-binop/vaddc-neon-x8.c",
- "src/f32-binop/vmul-neon-x4.c",
- "src/f32-binop/vmul-neon-x8.c",
- "src/f32-binop/vmulc-neon-x4.c",
- "src/f32-binop/vmulc-neon-x8.c",
- "src/f32-binop/vrsubc-neon-x4.c",
- "src/f32-binop/vrsubc-neon-x8.c",
- "src/f32-binop/vsub-neon-x4.c",
- "src/f32-binop/vsub-neon-x8.c",
- "src/f32-binop/vsubc-neon-x4.c",
- "src/f32-binop/vsubc-neon-x8.c",
"src/f32-clamp/neon.c",
- "src/f32-dwconv/up4x9-neon.c",
"src/f32-dwconv/up4x9-neon-acc2.c",
- "src/f32-dwconv/up8x9-neon.c",
+ "src/f32-dwconv/up4x9-neon.c",
"src/f32-dwconv/up8x9-neon-acc2.c",
+ "src/f32-dwconv/up8x9-neon.c",
"src/f32-gavgpool-spchw/neon-x4.c",
"src/f32-gavgpool/mp7p7q-neon.c",
"src/f32-gavgpool/up7-neon.c",
"src/f32-gemm/1x8-neon-ld64.c",
+ "src/f32-gemm/1x8s4-neon.c",
"src/f32-gemm/4x2-neon-ld64.c",
"src/f32-gemm/4x8-neon-ld128.c",
"src/f32-gemm/4x8-neon-ld64.c",
+ "src/f32-gemm/4x8s4-neon.c",
"src/f32-gemm/5x8-neon-ld64.c",
"src/f32-gemm/6x8-neon-ld64.c",
- "src/f32-gemm/1x8s4-neon.c",
- "src/f32-gemm/4x8s4-neon.c",
"src/f32-gemm/6x8s4-neon.c",
"src/f32-gemm/8x8s4-neon.c",
"src/f32-gemminc/1x8-neon-ld64.c",
+ "src/f32-gemminc/1x8s4-neon.c",
"src/f32-gemminc/4x8-neon-ld128.c",
"src/f32-gemminc/4x8-neon-ld64.c",
+ "src/f32-gemminc/4x8s4-neon.c",
"src/f32-gemminc/5x8-neon-ld64.c",
"src/f32-gemminc/6x8-neon-ld64.c",
- "src/f32-gemminc/1x8s4-neon.c",
- "src/f32-gemminc/4x8s4-neon.c",
"src/f32-gemminc/6x8s4-neon.c",
"src/f32-gemminc/8x8s4-neon.c",
"src/f32-hswish/neon.c",
"src/f32-igemm/1x8-neon-ld64.c",
+ "src/f32-igemm/1x8s4-neon.c",
"src/f32-igemm/4x2-neon-ld64.c",
"src/f32-igemm/4x4-neon-ld64.c",
"src/f32-igemm/4x8-neon-ld128.c",
"src/f32-igemm/4x8-neon-ld64.c",
- "src/f32-igemm/6x8-neon-ld64.c",
- "src/f32-igemm/1x8s4-neon.c",
"src/f32-igemm/4x8s4-neon.c",
+ "src/f32-igemm/6x8-neon-ld64.c",
"src/f32-igemm/6x8s4-neon.c",
"src/f32-igemm/8x8s4-neon.c",
"src/f32-pavgpool/mp9p8q-neon.c",
@@ -336,6 +322,20 @@
"src/f32-prelu/neon-2x8.c",
"src/f32-rmax/neon.c",
"src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c",
+ "src/f32-vbinary/vadd-neon-x4.c",
+ "src/f32-vbinary/vadd-neon-x8.c",
+ "src/f32-vbinary/vaddc-neon-x4.c",
+ "src/f32-vbinary/vaddc-neon-x8.c",
+ "src/f32-vbinary/vmul-neon-x4.c",
+ "src/f32-vbinary/vmul-neon-x8.c",
+ "src/f32-vbinary/vmulc-neon-x4.c",
+ "src/f32-vbinary/vmulc-neon-x8.c",
+ "src/f32-vbinary/vrsubc-neon-x4.c",
+ "src/f32-vbinary/vrsubc-neon-x8.c",
+ "src/f32-vbinary/vsub-neon-x4.c",
+ "src/f32-vbinary/vsub-neon-x8.c",
+ "src/f32-vbinary/vsubc-neon-x4.c",
+ "src/f32-vbinary/vsubc-neon-x8.c",
"src/f32-vmulcaddc/c4-neon-2x.c",
"src/f32-vmulcaddc/c8-neon-2x.c",
"src/q8-avgpool/mp9p8q-neon.c",
@@ -450,40 +450,21 @@
"src/f32-avgpool/up9-sse.c",
"src/f32-bilinear/sse-c4.c",
"src/f32-bilinear/sse-c8.c",
- "src/f32-binop/vadd-sse-x4.c",
- "src/f32-binop/vadd-sse-x8.c",
- "src/f32-binop/vaddc-sse-x4.c",
- "src/f32-binop/vaddc-sse-x8.c",
- "src/f32-binop/vmul-sse-x4.c",
- "src/f32-binop/vmul-sse-x8.c",
- "src/f32-binop/vmulc-sse-x4.c",
- "src/f32-binop/vmulc-sse-x8.c",
- "src/f32-binop/vrsubc-sse-x4.c",
- "src/f32-binop/vrsubc-sse-x8.c",
- "src/f32-binop/vsub-sse-x4.c",
- "src/f32-binop/vsub-sse-x8.c",
- "src/f32-binop/vsubc-sse-x4.c",
- "src/f32-binop/vsubc-sse-x8.c",
"src/f32-clamp/sse.c",
- "src/f32-igemm/1x8-sse-dup.c",
- "src/f32-igemm/1x8-sse-load1.c",
- "src/f32-igemm/1x8s4-sse.c",
- "src/f32-igemm/4x2c4-sse.c",
- "src/f32-igemm/4x8-sse-dup.c",
- "src/f32-igemm/4x8-sse-load1.c",
- "src/f32-igemm/4x8s4-sse.c",
- "src/f32-dwconv/up4x25-sse.c",
- "src/f32-dwconv/up4x4-sse.c",
- "src/f32-dwconv/up4x9-sse.c",
+ "src/f32-dwconv-spchw/3x3p1-sse.c",
+ "src/f32-dwconv-spchw/3x3s2p1-sse.c",
"src/f32-dwconv/up4x25-sse-acc2.c",
+ "src/f32-dwconv/up4x25-sse.c",
"src/f32-dwconv/up4x4-sse-acc2.c",
+ "src/f32-dwconv/up4x4-sse.c",
"src/f32-dwconv/up4x9-sse-acc2.c",
- "src/f32-dwconv/up8x25-sse.c",
- "src/f32-dwconv/up8x4-sse.c",
- "src/f32-dwconv/up8x9-sse.c",
+ "src/f32-dwconv/up4x9-sse.c",
"src/f32-dwconv/up8x25-sse-acc2.c",
+ "src/f32-dwconv/up8x25-sse.c",
"src/f32-dwconv/up8x4-sse-acc2.c",
+ "src/f32-dwconv/up8x4-sse.c",
"src/f32-dwconv/up8x9-sse-acc2.c",
+ "src/f32-dwconv/up8x9-sse.c",
"src/f32-gavgpool-spchw/sse-x4.c",
"src/f32-gavgpool/mp7p7q-sse.c",
"src/f32-gavgpool/up7-sse.c",
@@ -500,15 +481,34 @@
"src/f32-gemminc/4x8-sse-load1.c",
"src/f32-gemminc/4x8s4-sse.c",
"src/f32-hswish/sse.c",
+ "src/f32-igemm/1x8-sse-dup.c",
+ "src/f32-igemm/1x8-sse-load1.c",
+ "src/f32-igemm/1x8s4-sse.c",
+ "src/f32-igemm/4x2c4-sse.c",
+ "src/f32-igemm/4x8-sse-dup.c",
+ "src/f32-igemm/4x8-sse-load1.c",
+ "src/f32-igemm/4x8s4-sse.c",
"src/f32-maxpool/9p8x-sse-c4.c",
"src/f32-pavgpool/mp9p8q-sse.c",
"src/f32-pavgpool/up9-sse.c",
- "src/f32-dwconv-spchw/3x3p1-sse.c",
- "src/f32-dwconv-spchw/3x3s2p1-sse.c",
"src/f32-ppmm/4x8-sse.c",
"src/f32-rmax/sse.c",
"src/f32-spmm/4x1-sse.c",
"src/f32-spmm/8x1-sse.c",
+ "src/f32-vbinary/vadd-sse-x4.c",
+ "src/f32-vbinary/vadd-sse-x8.c",
+ "src/f32-vbinary/vaddc-sse-x4.c",
+ "src/f32-vbinary/vaddc-sse-x8.c",
+ "src/f32-vbinary/vmul-sse-x4.c",
+ "src/f32-vbinary/vmul-sse-x8.c",
+ "src/f32-vbinary/vmulc-sse-x4.c",
+ "src/f32-vbinary/vmulc-sse-x8.c",
+ "src/f32-vbinary/vrsubc-sse-x4.c",
+ "src/f32-vbinary/vrsubc-sse-x8.c",
+ "src/f32-vbinary/vsub-sse-x4.c",
+ "src/f32-vbinary/vsub-sse-x8.c",
+ "src/f32-vbinary/vsubc-sse-x4.c",
+ "src/f32-vbinary/vsubc-sse-x8.c",
"src/f32-vmulcaddc/c4-sse-2x.c",
"src/f32-vmulcaddc/c8-sse-2x.c",
"src/x32-packx/x4-sse.c",
@@ -709,12 +709,12 @@
"src/xnnpack/spmm.h",
"src/xnnpack/unpool.h",
"src/xnnpack/vadd.h",
- "src/xnnpack/vbinop.h",
+ "src/xnnpack/vbinary.h",
"src/xnnpack/vmulcaddc.h",
"src/xnnpack/vscale.h",
"src/xnnpack/vscaleexpminusmax.h",
"src/xnnpack/vscaleextexp.h",
- "src/xnnpack/vunop.h",
+ "src/xnnpack/vunary.h",
"src/xnnpack/zip.h",
]
@@ -1637,7 +1637,7 @@
name = "f32_sigmoid_test",
srcs = [
"test/f32-sigmoid.cc",
- "test/vunop-microkernel-tester.h",
+ "test/vunary-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1656,7 +1656,7 @@
name = "f32_vadd_test",
srcs = [
"test/f32-vadd.cc",
- "test/vbinop-microkernel-tester.h",
+ "test/vbinary-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1665,7 +1665,7 @@
name = "f32_vaddc_test",
srcs = [
"test/f32-vaddc.cc",
- "test/vbinopc-microkernel-tester.h",
+ "test/vbinaryc-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1674,7 +1674,7 @@
name = "f32_vmul_test",
srcs = [
"test/f32-vmul.cc",
- "test/vbinop-microkernel-tester.h",
+ "test/vbinary-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1683,7 +1683,7 @@
name = "f32_vmulc_test",
srcs = [
"test/f32-vmulc.cc",
- "test/vbinopc-microkernel-tester.h",
+ "test/vbinaryc-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1729,7 +1729,7 @@
name = "f32_vsub_test",
srcs = [
"test/f32-vsub.cc",
- "test/vbinop-microkernel-tester.h",
+ "test/vbinary-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1738,7 +1738,7 @@
name = "f32_vsubc_test",
srcs = [
"test/f32-vsubc.cc",
- "test/vbinopc-microkernel-tester.h",
+ "test/vbinaryc-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -1747,7 +1747,7 @@
name = "f32_vrsubc_test",
srcs = [
"test/f32-vrsubc.cc",
- "test/vbinopc-microkernel-tester.h",
+ "test/vbinaryc-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f34fbeb..cbd9213 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,49 +172,24 @@
src/f32-bilinear/scalar-c1.c
src/f32-bilinear/scalar-c2.c
src/f32-bilinear/scalar-c4.c
- src/f32-binop/vadd-scalar-x1.c
- src/f32-binop/vadd-scalar-x2.c
- src/f32-binop/vadd-scalar-x4.c
- src/f32-binop/vaddc-scalar-x1.c
- src/f32-binop/vaddc-scalar-x2.c
- src/f32-binop/vaddc-scalar-x4.c
- src/f32-binop/vmul-scalar-x1.c
- src/f32-binop/vmul-scalar-x2.c
- src/f32-binop/vmul-scalar-x4.c
- src/f32-binop/vmulc-scalar-x1.c
- src/f32-binop/vmulc-scalar-x2.c
- src/f32-binop/vmulc-scalar-x4.c
- src/f32-binop/vsub-scalar-x1.c
- src/f32-binop/vsub-scalar-x2.c
- src/f32-binop/vsub-scalar-x4.c
- src/f32-binop/vsubc-scalar-x1.c
- src/f32-binop/vsubc-scalar-x2.c
- src/f32-binop/vsubc-scalar-x4.c
- src/f32-binop/vrsubc-scalar-x1.c
- src/f32-binop/vrsubc-scalar-x2.c
- src/f32-binop/vrsubc-scalar-x4.c
src/f32-clamp/scalar.c
src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c
- src/f32-igemm/1x4-scalar.c
- src/f32-igemm/2x4-scalar.c
- src/f32-igemm/4x2-scalar.c
- src/f32-igemm/4x4-scalar.c
- src/f32-dwconv/up1x25-scalar.c
- src/f32-dwconv/up1x25-scalar-acc2.c
- src/f32-dwconv/up2x25-scalar.c
- src/f32-dwconv/up2x25-scalar-acc2.c
- src/f32-dwconv/up1x4-scalar.c
- src/f32-dwconv/up1x4-scalar-acc2.c
- src/f32-dwconv/up2x4-scalar.c
- src/f32-dwconv/up2x4-scalar-acc2.c
- src/f32-dwconv/up1x9-scalar.c
- src/f32-dwconv/up1x9-scalar-acc2.c
- src/f32-dwconv/up2x9-scalar.c
- src/f32-dwconv/up2x9-scalar-acc2.c
src/f32-dwconv-spchw/3x3p1-scalar.c
- src/f32-dwconv-spchw/5x5p2-scalar.c
src/f32-dwconv-spchw/3x3s2p1-scalar.c
+ src/f32-dwconv-spchw/5x5p2-scalar.c
src/f32-dwconv-spchw/5x5s2p2-scalar.c
+ src/f32-dwconv/up1x25-scalar-acc2.c
+ src/f32-dwconv/up1x25-scalar.c
+ src/f32-dwconv/up1x4-scalar-acc2.c
+ src/f32-dwconv/up1x4-scalar.c
+ src/f32-dwconv/up1x9-scalar-acc2.c
+ src/f32-dwconv/up1x9-scalar.c
+ src/f32-dwconv/up2x25-scalar-acc2.c
+ src/f32-dwconv/up2x25-scalar.c
+ src/f32-dwconv/up2x4-scalar-acc2.c
+ src/f32-dwconv/up2x4-scalar.c
+ src/f32-dwconv/up2x9-scalar-acc2.c
+ src/f32-dwconv/up2x9-scalar.c
src/f32-gavgpool-spchw/scalar-x1.c
src/f32-gavgpool/mp7p7q-scalar.c
src/f32-gavgpool/up7-scalar.c
@@ -226,6 +201,10 @@
src/f32-gemminc/2x4-scalar.c
src/f32-gemminc/4x4-scalar.c
src/f32-hswish/scalar.c
+ src/f32-igemm/1x4-scalar.c
+ src/f32-igemm/2x4-scalar.c
+ src/f32-igemm/4x2-scalar.c
+ src/f32-igemm/4x4-scalar.c
src/f32-maxpool/9p8x-scalar-c1.c
src/f32-pavgpool/mp9p8q-scalar.c
src/f32-pavgpool/up9-scalar.c
@@ -246,16 +225,37 @@
src/f32-spmm/8x1-scalar.c
src/f32-spmm/8x2-scalar.c
src/f32-spmm/8x4-scalar.c
+ src/f32-vbinary/vadd-scalar-x1.c
+ src/f32-vbinary/vadd-scalar-x2.c
+ src/f32-vbinary/vadd-scalar-x4.c
+ src/f32-vbinary/vaddc-scalar-x1.c
+ src/f32-vbinary/vaddc-scalar-x2.c
+ src/f32-vbinary/vaddc-scalar-x4.c
+ src/f32-vbinary/vmul-scalar-x1.c
+ src/f32-vbinary/vmul-scalar-x2.c
+ src/f32-vbinary/vmul-scalar-x4.c
+ src/f32-vbinary/vmulc-scalar-x1.c
+ src/f32-vbinary/vmulc-scalar-x2.c
+ src/f32-vbinary/vmulc-scalar-x4.c
+ src/f32-vbinary/vrsubc-scalar-x1.c
+ src/f32-vbinary/vrsubc-scalar-x2.c
+ src/f32-vbinary/vrsubc-scalar-x4.c
+ src/f32-vbinary/vsub-scalar-x1.c
+ src/f32-vbinary/vsub-scalar-x2.c
+ src/f32-vbinary/vsub-scalar-x4.c
+ src/f32-vbinary/vsubc-scalar-x1.c
+ src/f32-vbinary/vsubc-scalar-x2.c
+ src/f32-vbinary/vsubc-scalar-x4.c
src/f32-vmulcaddc/c1-scalar-2x.c
src/f32-vmulcaddc/c2-scalar-2x.c
src/f32-vmulcaddc/c4-scalar-2x.c
src/q8-avgpool/mp9p8q-scalar.c
src/q8-avgpool/up9-scalar.c
- src/q8-igemm/2x2-scalar.c
src/q8-dwconv/up1x9-scalar.c
src/q8-gavgpool/mp7p7q-scalar.c
src/q8-gavgpool/up7-scalar.c
src/q8-gemm/2x2-scalar.c
+ src/q8-igemm/2x2-scalar.c
src/q8-vadd/scalar.c
src/u8-clamp/scalar.c
src/u8-lut32norm/scalar.c
@@ -284,43 +284,19 @@
src/f32-avgpool/up9-psimd.c
src/f32-bilinear/psimd-c4.c
src/f32-bilinear/psimd-c8.c
- src/f32-binop/vadd-psimd-x4.c
- src/f32-binop/vadd-psimd-x8.c
- src/f32-binop/vaddc-psimd-x4.c
- src/f32-binop/vaddc-psimd-x8.c
- src/f32-binop/vmul-psimd-x4.c
- src/f32-binop/vmul-psimd-x8.c
- src/f32-binop/vmulc-psimd-x4.c
- src/f32-binop/vmulc-psimd-x8.c
- src/f32-binop/vrsubc-psimd-x4.c
- src/f32-binop/vrsubc-psimd-x8.c
- src/f32-binop/vsub-psimd-x4.c
- src/f32-binop/vsub-psimd-x8.c
- src/f32-binop/vsubc-psimd-x4.c
- src/f32-binop/vsubc-psimd-x8.c
src/f32-clamp/psimd.c
- src/f32-igemm/1x8-psimd-loadsplat.c
- src/f32-igemm/1x8-psimd-splat.c
- src/f32-igemm/1x8s4-psimd.c
- src/f32-igemm/4x2c4-psimd.c
- src/f32-igemm/4x8-psimd-loadsplat.c
- src/f32-igemm/4x8-psimd-splat.c
- src/f32-igemm/4x8s4-psimd.c
- src/f32-igemm/6x8-psimd-loadsplat.c
- src/f32-igemm/6x8-psimd-splat.c
- src/f32-igemm/6x8s4-psimd.c
- src/f32-dwconv/up4x25-psimd.c
- src/f32-dwconv/up4x4-psimd.c
- src/f32-dwconv/up4x9-psimd.c
src/f32-dwconv/up4x25-psimd-acc2.c
+ src/f32-dwconv/up4x25-psimd.c
src/f32-dwconv/up4x4-psimd-acc2.c
+ src/f32-dwconv/up4x4-psimd.c
src/f32-dwconv/up4x9-psimd-acc2.c
- src/f32-dwconv/up8x25-psimd.c
- src/f32-dwconv/up8x4-psimd.c
- src/f32-dwconv/up8x9-psimd.c
+ src/f32-dwconv/up4x9-psimd.c
src/f32-dwconv/up8x25-psimd-acc2.c
+ src/f32-dwconv/up8x25-psimd.c
src/f32-dwconv/up8x4-psimd-acc2.c
+ src/f32-dwconv/up8x4-psimd.c
src/f32-dwconv/up8x9-psimd-acc2.c
+ src/f32-dwconv/up8x9-psimd.c
src/f32-gavgpool/mp7p7q-psimd.c
src/f32-gavgpool/up7-psimd.c
src/f32-gemm/1x8-psimd-loadsplat.c
@@ -342,12 +318,36 @@
src/f32-gemminc/6x8-psimd-splat.c
src/f32-gemminc/6x8s4-psimd.c
src/f32-hswish/psimd.c
+ src/f32-igemm/1x8-psimd-loadsplat.c
+ src/f32-igemm/1x8-psimd-splat.c
+ src/f32-igemm/1x8s4-psimd.c
+ src/f32-igemm/4x2c4-psimd.c
+ src/f32-igemm/4x8-psimd-loadsplat.c
+ src/f32-igemm/4x8-psimd-splat.c
+ src/f32-igemm/4x8s4-psimd.c
+ src/f32-igemm/6x8-psimd-loadsplat.c
+ src/f32-igemm/6x8-psimd-splat.c
+ src/f32-igemm/6x8s4-psimd.c
src/f32-maxpool/9p8x-psimd-c4.c
src/f32-pavgpool/mp9p8q-psimd.c
src/f32-pavgpool/up9-psimd.c
src/f32-ppmm/4x8-psimd.c
src/f32-prelu/psimd-2x4.c
src/f32-prelu/psimd-2x8.c
+ src/f32-vbinary/vadd-psimd-x4.c
+ src/f32-vbinary/vadd-psimd-x8.c
+ src/f32-vbinary/vaddc-psimd-x4.c
+ src/f32-vbinary/vaddc-psimd-x8.c
+ src/f32-vbinary/vmul-psimd-x4.c
+ src/f32-vbinary/vmul-psimd-x8.c
+ src/f32-vbinary/vmulc-psimd-x4.c
+ src/f32-vbinary/vmulc-psimd-x8.c
+ src/f32-vbinary/vrsubc-psimd-x4.c
+ src/f32-vbinary/vrsubc-psimd-x8.c
+ src/f32-vbinary/vsub-psimd-x4.c
+ src/f32-vbinary/vsub-psimd-x8.c
+ src/f32-vbinary/vsubc-psimd-x4.c
+ src/f32-vbinary/vsubc-psimd-x8.c
src/f32-vmulcaddc/c4-psimd-2x.c
src/f32-vmulcaddc/c8-psimd-2x.c
src/x32-packx/x4-psimd.c
@@ -363,58 +363,44 @@
src/f32-avgpool/up9-neon.c
src/f32-bilinear/neon-c4.c
src/f32-bilinear/neon-c8.c
- src/f32-binop/vadd-neon-x4.c
- src/f32-binop/vadd-neon-x8.c
- src/f32-binop/vaddc-neon-x4.c
- src/f32-binop/vaddc-neon-x8.c
- src/f32-binop/vmul-neon-x4.c
- src/f32-binop/vmul-neon-x8.c
- src/f32-binop/vmulc-neon-x4.c
- src/f32-binop/vmulc-neon-x8.c
- src/f32-binop/vrsubc-neon-x4.c
- src/f32-binop/vrsubc-neon-x8.c
- src/f32-binop/vsub-neon-x4.c
- src/f32-binop/vsub-neon-x8.c
- src/f32-binop/vsubc-neon-x4.c
- src/f32-binop/vsubc-neon-x8.c
src/f32-clamp/neon.c
- src/f32-igemm/1x8-neon-ld64.c
- src/f32-igemm/4x2-neon-ld64.c
- src/f32-igemm/4x4-neon-ld64.c
- src/f32-igemm/4x8-neon-ld128.c
- src/f32-igemm/4x8-neon-ld64.c
- src/f32-igemm/6x8-neon-ld64.c
- src/f32-igemm/1x8s4-neon.c
- src/f32-igemm/4x8s4-neon.c
- src/f32-igemm/6x8s4-neon.c
- src/f32-igemm/8x8s4-neon.c
- src/f32-dwconv/up4x9-neon.c
src/f32-dwconv/up4x9-neon-acc2.c
- src/f32-dwconv/up8x9-neon.c
+ src/f32-dwconv/up4x9-neon.c
src/f32-dwconv/up8x9-neon-acc2.c
+ src/f32-dwconv/up8x9-neon.c
src/f32-gavgpool-spchw/neon-x4.c
src/f32-gavgpool/mp7p7q-neon.c
src/f32-gavgpool/up7-neon.c
src/f32-gemm/1x8-neon-ld64.c
+ src/f32-gemm/1x8s4-neon.c
src/f32-gemm/4x2-neon-ld64.c
src/f32-gemm/4x8-neon-ld128.c
src/f32-gemm/4x8-neon-ld64.c
+ src/f32-gemm/4x8s4-neon.c
src/f32-gemm/5x8-neon-ld64.c
src/f32-gemm/6x8-neon-ld64.c
- src/f32-gemm/1x8s4-neon.c
- src/f32-gemm/4x8s4-neon.c
src/f32-gemm/6x8s4-neon.c
src/f32-gemm/8x8s4-neon.c
src/f32-gemminc/1x8-neon-ld64.c
+ src/f32-gemminc/1x8s4-neon.c
src/f32-gemminc/4x8-neon-ld128.c
src/f32-gemminc/4x8-neon-ld64.c
+ src/f32-gemminc/4x8s4-neon.c
src/f32-gemminc/5x8-neon-ld64.c
src/f32-gemminc/6x8-neon-ld64.c
- src/f32-gemminc/1x8s4-neon.c
- src/f32-gemminc/4x8s4-neon.c
src/f32-gemminc/6x8s4-neon.c
src/f32-gemminc/8x8s4-neon.c
src/f32-hswish/neon.c
+ src/f32-igemm/1x8-neon-ld64.c
+ src/f32-igemm/1x8s4-neon.c
+ src/f32-igemm/4x2-neon-ld64.c
+ src/f32-igemm/4x4-neon-ld64.c
+ src/f32-igemm/4x8-neon-ld128.c
+ src/f32-igemm/4x8-neon-ld64.c
+ src/f32-igemm/4x8s4-neon.c
+ src/f32-igemm/6x8-neon-ld64.c
+ src/f32-igemm/6x8s4-neon.c
+ src/f32-igemm/8x8s4-neon.c
src/f32-pavgpool/mp9p8q-neon.c
src/f32-pavgpool/up9-neon.c
src/f32-ppmm/4x8-neon.c
@@ -423,17 +409,31 @@
src/f32-prelu/neon-2x8.c
src/f32-rmax/neon.c
src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
+ src/f32-vbinary/vadd-neon-x4.c
+ src/f32-vbinary/vadd-neon-x8.c
+ src/f32-vbinary/vaddc-neon-x4.c
+ src/f32-vbinary/vaddc-neon-x8.c
+ src/f32-vbinary/vmul-neon-x4.c
+ src/f32-vbinary/vmul-neon-x8.c
+ src/f32-vbinary/vmulc-neon-x4.c
+ src/f32-vbinary/vmulc-neon-x8.c
+ src/f32-vbinary/vrsubc-neon-x4.c
+ src/f32-vbinary/vrsubc-neon-x8.c
+ src/f32-vbinary/vsub-neon-x4.c
+ src/f32-vbinary/vsub-neon-x8.c
+ src/f32-vbinary/vsubc-neon-x4.c
+ src/f32-vbinary/vsubc-neon-x8.c
src/f32-vmulcaddc/c4-neon-2x.c
src/f32-vmulcaddc/c8-neon-2x.c
src/q8-avgpool/mp9p8q-neon.c
src/q8-avgpool/up9-neon.c
- src/q8-igemm/4x8-neon.c
- src/q8-igemm/8x8-neon.c
src/q8-dwconv/up8x9-neon.c
src/q8-gavgpool/mp7p7q-neon.c
src/q8-gavgpool/up7-neon.c
src/q8-gemm/4x8-neon.c
src/q8-gemm/8x8-neon.c
+ src/q8-igemm/4x8-neon.c
+ src/q8-igemm/8x8-neon.c
src/q8-vadd/neon.c
src/u8-clamp/neon.c
src/u8-maxpool/9p8x-neon-c16.c
@@ -533,40 +533,21 @@
src/f32-avgpool/up9-sse.c
src/f32-bilinear/sse-c4.c
src/f32-bilinear/sse-c8.c
- src/f32-binop/vadd-sse-x4.c
- src/f32-binop/vadd-sse-x8.c
- src/f32-binop/vaddc-sse-x4.c
- src/f32-binop/vaddc-sse-x8.c
- src/f32-binop/vmul-sse-x4.c
- src/f32-binop/vmul-sse-x8.c
- src/f32-binop/vmulc-sse-x4.c
- src/f32-binop/vmulc-sse-x8.c
- src/f32-binop/vrsubc-sse-x4.c
- src/f32-binop/vrsubc-sse-x8.c
- src/f32-binop/vsub-sse-x4.c
- src/f32-binop/vsub-sse-x8.c
- src/f32-binop/vsubc-sse-x4.c
- src/f32-binop/vsubc-sse-x8.c
src/f32-clamp/sse.c
- src/f32-igemm/1x8-sse-dup.c
- src/f32-igemm/1x8-sse-load1.c
- src/f32-igemm/1x8s4-sse.c
- src/f32-igemm/4x2c4-sse.c
- src/f32-igemm/4x8-sse-dup.c
- src/f32-igemm/4x8-sse-load1.c
- src/f32-igemm/4x8s4-sse.c
- src/f32-dwconv/up4x25-sse.c
- src/f32-dwconv/up4x4-sse.c
- src/f32-dwconv/up4x9-sse.c
+ src/f32-dwconv-spchw/3x3p1-sse.c
+ src/f32-dwconv-spchw/3x3s2p1-sse.c
src/f32-dwconv/up4x25-sse-acc2.c
+ src/f32-dwconv/up4x25-sse.c
src/f32-dwconv/up4x4-sse-acc2.c
+ src/f32-dwconv/up4x4-sse.c
src/f32-dwconv/up4x9-sse-acc2.c
- src/f32-dwconv/up8x25-sse.c
- src/f32-dwconv/up8x4-sse.c
- src/f32-dwconv/up8x9-sse.c
+ src/f32-dwconv/up4x9-sse.c
src/f32-dwconv/up8x25-sse-acc2.c
+ src/f32-dwconv/up8x25-sse.c
src/f32-dwconv/up8x4-sse-acc2.c
+ src/f32-dwconv/up8x4-sse.c
src/f32-dwconv/up8x9-sse-acc2.c
+ src/f32-dwconv/up8x9-sse.c
src/f32-gavgpool-spchw/sse-x4.c
src/f32-gavgpool/mp7p7q-sse.c
src/f32-gavgpool/up7-sse.c
@@ -583,15 +564,34 @@
src/f32-gemminc/4x8-sse-load1.c
src/f32-gemminc/4x8s4-sse.c
src/f32-hswish/sse.c
+ src/f32-igemm/1x8-sse-dup.c
+ src/f32-igemm/1x8-sse-load1.c
+ src/f32-igemm/1x8s4-sse.c
+ src/f32-igemm/4x2c4-sse.c
+ src/f32-igemm/4x8-sse-dup.c
+ src/f32-igemm/4x8-sse-load1.c
+ src/f32-igemm/4x8s4-sse.c
src/f32-maxpool/9p8x-sse-c4.c
src/f32-pavgpool/mp9p8q-sse.c
src/f32-pavgpool/up9-sse.c
- src/f32-dwconv-spchw/3x3p1-sse.c
- src/f32-dwconv-spchw/3x3s2p1-sse.c
src/f32-ppmm/4x8-sse.c
src/f32-rmax/sse.c
src/f32-spmm/4x1-sse.c
src/f32-spmm/8x1-sse.c
+ src/f32-vbinary/vadd-sse-x4.c
+ src/f32-vbinary/vadd-sse-x8.c
+ src/f32-vbinary/vaddc-sse-x4.c
+ src/f32-vbinary/vaddc-sse-x8.c
+ src/f32-vbinary/vmul-sse-x4.c
+ src/f32-vbinary/vmul-sse-x8.c
+ src/f32-vbinary/vmulc-sse-x4.c
+ src/f32-vbinary/vmulc-sse-x8.c
+ src/f32-vbinary/vrsubc-sse-x4.c
+ src/f32-vbinary/vrsubc-sse-x8.c
+ src/f32-vbinary/vsub-sse-x4.c
+ src/f32-vbinary/vsub-sse-x8.c
+ src/f32-vbinary/vsubc-sse-x4.c
+ src/f32-vbinary/vsubc-sse-x8.c
src/f32-vmulcaddc/c4-sse-2x.c
src/f32-vmulcaddc/c8-sse-2x.c
src/x32-packx/x4-sse.c)
diff --git a/bench/f32-sigmoid.cc b/bench/f32-sigmoid.cc
index b042e44..ba12aea 100644
--- a/bench/f32-sigmoid.cc
+++ b/bench/f32-sigmoid.cc
@@ -14,12 +14,12 @@
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
static void f32_sigmoid(
benchmark::State& state,
- xnn_f32_vunop_ukernel_function sigmoid)
+ xnn_f32_vunary_ukernel_function sigmoid)
{
const size_t elements = state.range(0);
diff --git a/scripts/generate-f32-sigmoid.sh b/scripts/generate-f32-sigmoid.sh
index 0db9ffc..99a94ce 100755
--- a/scripts/generate-f32-sigmoid.sh
+++ b/scripts/generate-f32-sigmoid.sh
@@ -13,4 +13,4 @@
tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=16 -D BLEND=1 -o src/f32-sigmoid/sse41-p5-div-x16.c
################################## Unit tests #################################
-tools/generate-vunop-test.py --spec test/f32-sigmoid.yaml --output test/f32-sigmoid.cc
+tools/generate-vunary-test.py --spec test/f32-sigmoid.yaml --output test/f32-sigmoid.cc
diff --git a/scripts/generate-f32-vbinop.sh b/scripts/generate-f32-vbinary.sh
similarity index 91%
rename from scripts/generate-f32-vbinop.sh
rename to scripts/generate-f32-vbinary.sh
index dd23bc7..cb5eb27 100755
--- a/scripts/generate-f32-vbinop.sh
+++ b/scripts/generate-f32-vbinary.sh
@@ -80,10 +80,10 @@
tools/xngen src/f32-binop/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-binop/vrsubc-sse-x8.c
################################## Unit tests #################################
-tools/generate-vbinop-test.py --spec test/f32-vadd.yaml --output test/f32-vadd.cc
-tools/generate-vbinop-test.py --spec test/f32-vmul.yaml --output test/f32-vmul.cc
-tools/generate-vbinop-test.py --spec test/f32-vsub.yaml --output test/f32-vsub.cc
-tools/generate-vbinop-test.py --spec test/f32-vaddc.yaml --output test/f32-vaddc.cc
-tools/generate-vbinop-test.py --spec test/f32-vmulc.yaml --output test/f32-vmulc.cc
-tools/generate-vbinop-test.py --spec test/f32-vsubc.yaml --output test/f32-vsubc.cc
-tools/generate-vbinop-test.py --spec test/f32-vrsubc.yaml --output test/f32-vrsubc.cc
+tools/generate-vbinary-test.py --spec test/f32-vadd.yaml --output test/f32-vadd.cc
+tools/generate-vbinary-test.py --spec test/f32-vmul.yaml --output test/f32-vmul.cc
+tools/generate-vbinary-test.py --spec test/f32-vsub.yaml --output test/f32-vsub.cc
+tools/generate-vbinary-test.py --spec test/f32-vaddc.yaml --output test/f32-vaddc.cc
+tools/generate-vbinary-test.py --spec test/f32-vmulc.yaml --output test/f32-vmulc.cc
+tools/generate-vbinary-test.py --spec test/f32-vsubc.yaml --output test/f32-vsubc.cc
+tools/generate-vbinary-test.py --spec test/f32-vrsubc.yaml --output test/f32-vrsubc.cc
diff --git a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c b/src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
index 994f7af..b66f073 100644
--- a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
+++ b/src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16(
diff --git a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in b/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in
index 012b8eb..434f86e 100644
--- a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in
+++ b/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in
@@ -11,7 +11,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x${BATCH_TILE}(
diff --git a/src/f32-sigmoid/neonfma-p5-nr2fma-x16.c b/src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
index 5555154..0c8ac1b 100644
--- a/src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
+++ b/src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x16(
diff --git a/src/f32-sigmoid/neonfma-p5-nr2fma.c.in b/src/f32-sigmoid/neonfma-p5-nr2fma.c.in
index c11b062..fe952bf 100644
--- a/src/f32-sigmoid/neonfma-p5-nr2fma.c.in
+++ b/src/f32-sigmoid/neonfma-p5-nr2fma.c.in
@@ -11,7 +11,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x${BATCH_TILE}(
diff --git a/src/f32-sigmoid/sse-p5-div.c.in b/src/f32-sigmoid/sse-p5-div.c.in
index e3c1a00..4ae3897 100644
--- a/src/f32-sigmoid/sse-p5-div.c.in
+++ b/src/f32-sigmoid/sse-p5-div.c.in
@@ -14,7 +14,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__${"sse41" if BLEND else "sse2"}_p5_div_x${BATCH_TILE}(
diff --git a/src/f32-sigmoid/sse2-p5-div-x16.c b/src/f32-sigmoid/sse2-p5-div-x16.c
index 7dbc685..5e232cf 100644
--- a/src/f32-sigmoid/sse2-p5-div-x16.c
+++ b/src/f32-sigmoid/sse2-p5-div-x16.c
@@ -12,7 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__sse2_p5_div_x16(
diff --git a/src/f32-sigmoid/sse2-p5-div-x8.c b/src/f32-sigmoid/sse2-p5-div-x8.c
index c869639..bf9bf94 100644
--- a/src/f32-sigmoid/sse2-p5-div-x8.c
+++ b/src/f32-sigmoid/sse2-p5-div-x8.c
@@ -12,7 +12,7 @@
#include <emmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__sse2_p5_div_x8(
diff --git a/src/f32-sigmoid/sse2-p5-div.c.in b/src/f32-sigmoid/sse2-p5-div.c.in
new file mode 100644
index 0000000..ab02615
--- /dev/null
+++ b/src/f32-sigmoid/sse2-p5-div.c.in
@@ -0,0 +1,286 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMN"
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse2_p5_div_x${BATCH_TILE}(
+ size_t n,
+ const float* x,
+ float* y,
+ const void* params)
+{
+ assert(n % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which sigmoidf(x) is normalized.
+ // This number is also the smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+ // The largest x for which sigmoidf(x) is not equal to 1.0.
+ const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 8 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+ const __m128 vone = _mm_set1_ps(1.0f);
+ const __m128 vsign_mask = _mm_set1_ps(-0.0f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+ const __m128 vx${ABC[0:4]} = _mm_loadu_ps(x);
+ $for N in range(4, BATCH_TILE, 4):
+ const __m128 vx${ABC[N:N+4]} = _mm_loadu_ps(x + ${N});
+
+ // General structure of the algorithm:
+ // / exp(x) / (1 + exp(x)) if x <= 0
+ // f[x] :=
+ // \ 1 - f[-x] if x >= 0
+ //
+ // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+ // then replace result with 1 - f[z] if x >= 0.
+ $for N in range(0, BATCH_TILE, 4):
+ const __m128 vz${ABC[N:N+4]} = _mm_or_ps(vx${ABC[N:N+4]}, vsign_mask);
+
+ // Compute reduced argument n := round(z / log(2)).
+ // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+ // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+ // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+ // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+ // the algorithm.
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vz${ABC[N:N+4]}, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+ $for N in range(0, BATCH_TILE, 4):
+ const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23));
+
+ // Subtract the large number back to get final n := round(z / log(2)).
+ $for N in range(0, BATCH_TILE, 4):
+ vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias);
+
+ // Compute reduced argument t := z - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vz${ABC[N:N+4]});
+
+ $for N in range(0, BATCH_TILE, 4):
+ vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]});
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1);
+
+ // Reconstruct the exp(z) value:
+ // e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ $for N in range(0, BATCH_TILE, 4):
+ vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 ve${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]});
+
+ // Denominator of the sigmoid fraction: 1.0 + exp(z)
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 vd${ABC[N:N+4]} = _mm_add_ps(ve${ABC[N:N+4]}, vone);
+
+ // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 vf${ABC[N:N+4]} = _mm_div_ps(ve${ABC[N:N+4]}, vd${ABC[N:N+4]});
+
+ // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+ $for N in range(0, BATCH_TILE, 4):
+ __m128 vm${ABC[N:N+4]} = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx${ABC[N:N+4]})));
+
+ $for N in range(0, BATCH_TILE, 4):
+ vf${ABC[N:N+4]} = _mm_or_ps(_mm_and_ps(vf${ABC[N:N+4]}, vm${ABC[N:N+4]}), _mm_andnot_ps(vm${ABC[N:N+4]}, _mm_sub_ps(vone, vf${ABC[N:N+4]})));
+
+ // For inputs above 1.0 cutoff, replace output with 1.0.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ $for N in range(0, BATCH_TILE, 4):
+ vm${ABC[N:N+4]} = _mm_cmpgt_ps(vx${ABC[N:N+4]}, vone_cutoff);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vf${ABC[N:N+4]} = _mm_or_ps(_mm_and_ps(vone, vm${ABC[N:N+4]}), _mm_andnot_ps(vm${ABC[N:N+4]}, vf${ABC[N:N+4]}));
+
+ // For inputs below denormal cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ $for N in range(0, BATCH_TILE, 4):
+ vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vx${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]});
+
+ _mm_storeu_ps(y, vf${ABC[0:4]});
+ $for N in range(4, BATCH_TILE, 4):
+ _mm_storeu_ps(y + ${N}, vf${ABC[N:N+4]});
+
+ x += ${BATCH_TILE};
+ y += ${BATCH_TILE};
+ }
+ $if BATCH_TILE > 4:
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ const __m128 vx0123 = _mm_loadu_ps(x);
+
+ // General structure of the algorithm:
+ // / exp(x) / (1 + exp(x)) if x <= 0
+ // f[x] :=
+ // \ 1 - f[-x] if x >= 0
+ //
+ // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+ // then replace result with 1 - f[z] if x >= 0.
+ const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+
+ // Compute reduced argument n := round(z / log(2)).
+ // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+ // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+ // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+ // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+ // the algorithm.
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+
+ // Subtract the large number back to get final n := round(z / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+
+ // Compute reduced argument t := z - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+
+ // Reconstruct the exp(z) value:
+ // e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+
+ // Denominator of the sigmoid fraction: 1.0 + exp(z)
+ __m128 vd0123 = _mm_add_ps(ve0123, vone);
+
+ // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+ __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+
+ // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+ __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
+ vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
+
+ // For inputs above 1.0 cutoff, replace output with 1.0.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
+ vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+
+ // For inputs below denormal cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+
+ _mm_storeu_ps(y, vf0123);
+
+ x += 4;
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ const __m128 vx0123 = _mm_loadu_ps(x);
+
+ // General structure of the algorithm:
+ // / exp(x) / (1 + exp(x)) if x <= 0
+ // f[x] :=
+ // \ 1 - f[-x] if x >= 0
+ //
+ // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+ // then replace result with 1 - f[z] if x >= 0.
+ const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+
+ // Compute reduced argument n := round(z / log(2)).
+ // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+ // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+ // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+ // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+ // the algorithm.
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+
+ // Subtract the large number back to get final n := round(z / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+
+ // Compute reduced argument t := z - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+
+ // Reconstruct the exp(z) value:
+ // e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+
+ // Denominator of the sigmoid fraction: 1.0 + exp(z)
+ __m128 vd0123 = _mm_add_ps(ve0123, vone);
+
+ // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+ __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+
+ // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+ __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
+ vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
+
+ // For inputs above 1.0 cutoff, replace output with 1.0.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
+ vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+
+ // For inputs below denormal cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+
+ if (n & (2 * sizeof(float))) {
+ _mm_storel_pi((__m64*) y, vf0123);
+ vf0123 = _mm_movehl_ps(vf0123, vf0123);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ _mm_store_ss(y, vf0123);
+ }
+ }
+}
diff --git a/src/f32-sigmoid/sse41-p5-div-x16.c b/src/f32-sigmoid/sse41-p5-div-x16.c
index 092eef0..cf95fec 100644
--- a/src/f32-sigmoid/sse41-p5-div-x16.c
+++ b/src/f32-sigmoid/sse41-p5-div-x16.c
@@ -12,7 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__sse41_p5_div_x16(
diff --git a/src/f32-sigmoid/sse41-p5-div-x8.c b/src/f32-sigmoid/sse41-p5-div-x8.c
index 69957a4..a2c16a5 100644
--- a/src/f32-sigmoid/sse41-p5-div-x8.c
+++ b/src/f32-sigmoid/sse41-p5-div-x8.c
@@ -12,7 +12,7 @@
#include <smmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
void xnn_f32_sigmoid_ukernel__sse41_p5_div_x8(
diff --git a/src/f32-binop/vadd-neon-x4.c b/src/f32-vbinary/vadd-neon-x4.c
similarity index 98%
rename from src/f32-binop/vadd-neon-x4.c
rename to src/f32-vbinary/vadd-neon-x4.c
index 97566f4..9e74662 100644
--- a/src/f32-binop/vadd-neon-x4.c
+++ b/src/f32-vbinary/vadd-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__neon_x4(
diff --git a/src/f32-binop/vadd-neon-x8.c b/src/f32-vbinary/vadd-neon-x8.c
similarity index 98%
rename from src/f32-binop/vadd-neon-x8.c
rename to src/f32-vbinary/vadd-neon-x8.c
index b71b5e4..fffdcb7 100644
--- a/src/f32-binop/vadd-neon-x8.c
+++ b/src/f32-vbinary/vadd-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__neon_x8(
diff --git a/src/f32-binop/vadd-psimd-x4.c b/src/f32-vbinary/vadd-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vadd-psimd-x4.c
rename to src/f32-vbinary/vadd-psimd-x4.c
index 145917c..8c98c88 100644
--- a/src/f32-binop/vadd-psimd-x4.c
+++ b/src/f32-vbinary/vadd-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__psimd_x4(
diff --git a/src/f32-binop/vadd-psimd-x8.c b/src/f32-vbinary/vadd-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vadd-psimd-x8.c
rename to src/f32-vbinary/vadd-psimd-x8.c
index e52b614..41d8e90 100644
--- a/src/f32-binop/vadd-psimd-x8.c
+++ b/src/f32-vbinary/vadd-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__psimd_x8(
diff --git a/src/f32-binop/vadd-scalar-x1.c b/src/f32-vbinary/vadd-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vadd-scalar-x1.c
rename to src/f32-vbinary/vadd-scalar-x1.c
index f71539f..11e9031 100644
--- a/src/f32-binop/vadd-scalar-x1.c
+++ b/src/f32-vbinary/vadd-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__scalar_x1(
diff --git a/src/f32-binop/vadd-scalar-x2.c b/src/f32-vbinary/vadd-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vadd-scalar-x2.c
rename to src/f32-vbinary/vadd-scalar-x2.c
index d4931fd..25e21f3 100644
--- a/src/f32-binop/vadd-scalar-x2.c
+++ b/src/f32-vbinary/vadd-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__scalar_x2(
diff --git a/src/f32-binop/vadd-scalar-x4.c b/src/f32-vbinary/vadd-scalar-x4.c
similarity index 98%
rename from src/f32-binop/vadd-scalar-x4.c
rename to src/f32-vbinary/vadd-scalar-x4.c
index a285245..761bbbe 100644
--- a/src/f32-binop/vadd-scalar-x4.c
+++ b/src/f32-vbinary/vadd-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__scalar_x4(
diff --git a/src/f32-binop/vadd-sse-x4.c b/src/f32-vbinary/vadd-sse-x4.c
similarity index 98%
rename from src/f32-binop/vadd-sse-x4.c
rename to src/f32-vbinary/vadd-sse-x4.c
index 7b09071..a73e9e6 100644
--- a/src/f32-binop/vadd-sse-x4.c
+++ b/src/f32-vbinary/vadd-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__sse_x4(
diff --git a/src/f32-binop/vadd-sse-x8.c b/src/f32-vbinary/vadd-sse-x8.c
similarity index 98%
rename from src/f32-binop/vadd-sse-x8.c
rename to src/f32-vbinary/vadd-sse-x8.c
index 92d5219..0e58308 100644
--- a/src/f32-binop/vadd-sse-x8.c
+++ b/src/f32-vbinary/vadd-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vadd_ukernel__sse_x8(
diff --git a/src/f32-binop/vaddc-neon-x4.c b/src/f32-vbinary/vaddc-neon-x4.c
similarity index 98%
rename from src/f32-binop/vaddc-neon-x4.c
rename to src/f32-vbinary/vaddc-neon-x4.c
index 0fc102d..8c911de 100644
--- a/src/f32-binop/vaddc-neon-x4.c
+++ b/src/f32-vbinary/vaddc-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__neon_x4(
diff --git a/src/f32-binop/vaddc-neon-x8.c b/src/f32-vbinary/vaddc-neon-x8.c
similarity index 98%
rename from src/f32-binop/vaddc-neon-x8.c
rename to src/f32-vbinary/vaddc-neon-x8.c
index 0ba792f..907a7e5 100644
--- a/src/f32-binop/vaddc-neon-x8.c
+++ b/src/f32-vbinary/vaddc-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__neon_x8(
diff --git a/src/f32-binop/vaddc-psimd-x4.c b/src/f32-vbinary/vaddc-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vaddc-psimd-x4.c
rename to src/f32-vbinary/vaddc-psimd-x4.c
index 8a7ca57..ac68b36 100644
--- a/src/f32-binop/vaddc-psimd-x4.c
+++ b/src/f32-vbinary/vaddc-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__psimd_x4(
diff --git a/src/f32-binop/vaddc-psimd-x8.c b/src/f32-vbinary/vaddc-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vaddc-psimd-x8.c
rename to src/f32-vbinary/vaddc-psimd-x8.c
index dd70d25..afa7ad1 100644
--- a/src/f32-binop/vaddc-psimd-x8.c
+++ b/src/f32-vbinary/vaddc-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__psimd_x8(
diff --git a/src/f32-binop/vaddc-scalar-x1.c b/src/f32-vbinary/vaddc-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vaddc-scalar-x1.c
rename to src/f32-vbinary/vaddc-scalar-x1.c
index 2ae3ffe..d591020 100644
--- a/src/f32-binop/vaddc-scalar-x1.c
+++ b/src/f32-vbinary/vaddc-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__scalar_x1(
diff --git a/src/f32-binop/vaddc-scalar-x2.c b/src/f32-vbinary/vaddc-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vaddc-scalar-x2.c
rename to src/f32-vbinary/vaddc-scalar-x2.c
index 677b3bf..a41445e 100644
--- a/src/f32-binop/vaddc-scalar-x2.c
+++ b/src/f32-vbinary/vaddc-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__scalar_x2(
diff --git a/src/f32-binop/vaddc-scalar-x4.c b/src/f32-vbinary/vaddc-scalar-x4.c
similarity index 97%
rename from src/f32-binop/vaddc-scalar-x4.c
rename to src/f32-vbinary/vaddc-scalar-x4.c
index 21caa29..92792d9 100644
--- a/src/f32-binop/vaddc-scalar-x4.c
+++ b/src/f32-vbinary/vaddc-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__scalar_x4(
diff --git a/src/f32-binop/vaddc-sse-x4.c b/src/f32-vbinary/vaddc-sse-x4.c
similarity index 97%
rename from src/f32-binop/vaddc-sse-x4.c
rename to src/f32-vbinary/vaddc-sse-x4.c
index e18007d..4442a01 100644
--- a/src/f32-binop/vaddc-sse-x4.c
+++ b/src/f32-vbinary/vaddc-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__sse_x4(
diff --git a/src/f32-binop/vaddc-sse-x8.c b/src/f32-vbinary/vaddc-sse-x8.c
similarity index 98%
rename from src/f32-binop/vaddc-sse-x8.c
rename to src/f32-vbinary/vaddc-sse-x8.c
index 6fc13f6..6580953 100644
--- a/src/f32-binop/vaddc-sse-x8.c
+++ b/src/f32-vbinary/vaddc-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vaddc_ukernel__sse_x8(
diff --git a/src/f32-binop/vmul-neon-x4.c b/src/f32-vbinary/vmul-neon-x4.c
similarity index 98%
rename from src/f32-binop/vmul-neon-x4.c
rename to src/f32-vbinary/vmul-neon-x4.c
index 873051d..93df938 100644
--- a/src/f32-binop/vmul-neon-x4.c
+++ b/src/f32-vbinary/vmul-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__neon_x4(
diff --git a/src/f32-binop/vmul-neon-x8.c b/src/f32-vbinary/vmul-neon-x8.c
similarity index 98%
rename from src/f32-binop/vmul-neon-x8.c
rename to src/f32-vbinary/vmul-neon-x8.c
index d543a63..7cdb655 100644
--- a/src/f32-binop/vmul-neon-x8.c
+++ b/src/f32-vbinary/vmul-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__neon_x8(
diff --git a/src/f32-binop/vmul-psimd-x4.c b/src/f32-vbinary/vmul-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vmul-psimd-x4.c
rename to src/f32-vbinary/vmul-psimd-x4.c
index df9ee0e..7802a34 100644
--- a/src/f32-binop/vmul-psimd-x4.c
+++ b/src/f32-vbinary/vmul-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__psimd_x4(
diff --git a/src/f32-binop/vmul-psimd-x8.c b/src/f32-vbinary/vmul-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vmul-psimd-x8.c
rename to src/f32-vbinary/vmul-psimd-x8.c
index 1149da4..c51e94e 100644
--- a/src/f32-binop/vmul-psimd-x8.c
+++ b/src/f32-vbinary/vmul-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__psimd_x8(
diff --git a/src/f32-binop/vmul-scalar-x1.c b/src/f32-vbinary/vmul-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vmul-scalar-x1.c
rename to src/f32-vbinary/vmul-scalar-x1.c
index 33567c5..2f47570 100644
--- a/src/f32-binop/vmul-scalar-x1.c
+++ b/src/f32-vbinary/vmul-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__scalar_x1(
diff --git a/src/f32-binop/vmul-scalar-x2.c b/src/f32-vbinary/vmul-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vmul-scalar-x2.c
rename to src/f32-vbinary/vmul-scalar-x2.c
index cf46b7d..317be4f 100644
--- a/src/f32-binop/vmul-scalar-x2.c
+++ b/src/f32-vbinary/vmul-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__scalar_x2(
diff --git a/src/f32-binop/vmul-scalar-x4.c b/src/f32-vbinary/vmul-scalar-x4.c
similarity index 98%
rename from src/f32-binop/vmul-scalar-x4.c
rename to src/f32-vbinary/vmul-scalar-x4.c
index bab1c56..a840d27 100644
--- a/src/f32-binop/vmul-scalar-x4.c
+++ b/src/f32-vbinary/vmul-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__scalar_x4(
diff --git a/src/f32-binop/vmul-sse-x4.c b/src/f32-vbinary/vmul-sse-x4.c
similarity index 98%
rename from src/f32-binop/vmul-sse-x4.c
rename to src/f32-vbinary/vmul-sse-x4.c
index ab1f096..a100481 100644
--- a/src/f32-binop/vmul-sse-x4.c
+++ b/src/f32-vbinary/vmul-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__sse_x4(
diff --git a/src/f32-binop/vmul-sse-x8.c b/src/f32-vbinary/vmul-sse-x8.c
similarity index 98%
rename from src/f32-binop/vmul-sse-x8.c
rename to src/f32-vbinary/vmul-sse-x8.c
index f4e8909..b9849f6 100644
--- a/src/f32-binop/vmul-sse-x8.c
+++ b/src/f32-vbinary/vmul-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmul_ukernel__sse_x8(
diff --git a/src/f32-binop/vmulc-neon-x4.c b/src/f32-vbinary/vmulc-neon-x4.c
similarity index 98%
rename from src/f32-binop/vmulc-neon-x4.c
rename to src/f32-vbinary/vmulc-neon-x4.c
index 1926c92..8b4b8d5 100644
--- a/src/f32-binop/vmulc-neon-x4.c
+++ b/src/f32-vbinary/vmulc-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__neon_x4(
diff --git a/src/f32-binop/vmulc-neon-x8.c b/src/f32-vbinary/vmulc-neon-x8.c
similarity index 98%
rename from src/f32-binop/vmulc-neon-x8.c
rename to src/f32-vbinary/vmulc-neon-x8.c
index 971fbf8..8b362e5 100644
--- a/src/f32-binop/vmulc-neon-x8.c
+++ b/src/f32-vbinary/vmulc-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__neon_x8(
diff --git a/src/f32-binop/vmulc-psimd-x4.c b/src/f32-vbinary/vmulc-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vmulc-psimd-x4.c
rename to src/f32-vbinary/vmulc-psimd-x4.c
index 552fc9d..cbb9ae6 100644
--- a/src/f32-binop/vmulc-psimd-x4.c
+++ b/src/f32-vbinary/vmulc-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__psimd_x4(
diff --git a/src/f32-binop/vmulc-psimd-x8.c b/src/f32-vbinary/vmulc-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vmulc-psimd-x8.c
rename to src/f32-vbinary/vmulc-psimd-x8.c
index 2a55630..4a5fbe4 100644
--- a/src/f32-binop/vmulc-psimd-x8.c
+++ b/src/f32-vbinary/vmulc-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__psimd_x8(
diff --git a/src/f32-binop/vmulc-scalar-x1.c b/src/f32-vbinary/vmulc-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vmulc-scalar-x1.c
rename to src/f32-vbinary/vmulc-scalar-x1.c
index 33e1b1e..b0a485a 100644
--- a/src/f32-binop/vmulc-scalar-x1.c
+++ b/src/f32-vbinary/vmulc-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__scalar_x1(
diff --git a/src/f32-binop/vmulc-scalar-x2.c b/src/f32-vbinary/vmulc-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vmulc-scalar-x2.c
rename to src/f32-vbinary/vmulc-scalar-x2.c
index 6c55f10..7d58a74 100644
--- a/src/f32-binop/vmulc-scalar-x2.c
+++ b/src/f32-vbinary/vmulc-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__scalar_x2(
diff --git a/src/f32-binop/vmulc-scalar-x4.c b/src/f32-vbinary/vmulc-scalar-x4.c
similarity index 97%
rename from src/f32-binop/vmulc-scalar-x4.c
rename to src/f32-vbinary/vmulc-scalar-x4.c
index 82917a5..c8888fc 100644
--- a/src/f32-binop/vmulc-scalar-x4.c
+++ b/src/f32-vbinary/vmulc-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__scalar_x4(
diff --git a/src/f32-binop/vmulc-sse-x4.c b/src/f32-vbinary/vmulc-sse-x4.c
similarity index 97%
rename from src/f32-binop/vmulc-sse-x4.c
rename to src/f32-vbinary/vmulc-sse-x4.c
index 6793c85..8dafee9 100644
--- a/src/f32-binop/vmulc-sse-x4.c
+++ b/src/f32-vbinary/vmulc-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__sse_x4(
diff --git a/src/f32-binop/vmulc-sse-x8.c b/src/f32-vbinary/vmulc-sse-x8.c
similarity index 98%
rename from src/f32-binop/vmulc-sse-x8.c
rename to src/f32-vbinary/vmulc-sse-x8.c
index fa57654..0f7f892 100644
--- a/src/f32-binop/vmulc-sse-x8.c
+++ b/src/f32-vbinary/vmulc-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vmulc_ukernel__sse_x8(
diff --git a/src/f32-binop/vop-neon.c.in b/src/f32-vbinary/vop-neon.c.in
similarity index 98%
rename from src/f32-binop/vop-neon.c.in
rename to src/f32-vbinary/vop-neon.c.in
index d635849..7018fc6 100644
--- a/src/f32-binop/vop-neon.c.in
+++ b/src/f32-vbinary/vop-neon.c.in
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$VOPQ_F32 = {
diff --git a/src/f32-binop/vop-psimd.c.in b/src/f32-vbinary/vop-psimd.c.in
similarity index 98%
rename from src/f32-binop/vop-psimd.c.in
rename to src/f32-vbinary/vop-psimd.c.in
index ece6eca..0ba0dcf 100644
--- a/src/f32-binop/vop-psimd.c.in
+++ b/src/f32-vbinary/vop-psimd.c.in
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$PSIMD_OP_F32 = {
diff --git a/src/f32-binop/vop-scalar.c.in b/src/f32-vbinary/vop-scalar.c.in
similarity index 98%
rename from src/f32-binop/vop-scalar.c.in
rename to src/f32-vbinary/vop-scalar.c.in
index ab6cfba..633218a 100644
--- a/src/f32-binop/vop-scalar.c.in
+++ b/src/f32-vbinary/vop-scalar.c.in
@@ -10,7 +10,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$OP_FUNC = {
diff --git a/src/f32-binop/vop-sse.c.in b/src/f32-vbinary/vop-sse.c.in
similarity index 98%
rename from src/f32-binop/vop-sse.c.in
rename to src/f32-vbinary/vop-sse.c.in
index 45a4247..d19b794 100644
--- a/src/f32-binop/vop-sse.c.in
+++ b/src/f32-vbinary/vop-sse.c.in
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$_MM_OP_PS = {
diff --git a/src/f32-binop/vopc-neon.c.in b/src/f32-vbinary/vopc-neon.c.in
similarity index 98%
rename from src/f32-binop/vopc-neon.c.in
rename to src/f32-vbinary/vopc-neon.c.in
index 2916336..dbf214f 100644
--- a/src/f32-binop/vopc-neon.c.in
+++ b/src/f32-vbinary/vopc-neon.c.in
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$VOPQ_F32 = {
diff --git a/src/f32-binop/vopc-psimd.c.in b/src/f32-vbinary/vopc-psimd.c.in
similarity index 98%
rename from src/f32-binop/vopc-psimd.c.in
rename to src/f32-vbinary/vopc-psimd.c.in
index ae12d21..6402f59 100644
--- a/src/f32-binop/vopc-psimd.c.in
+++ b/src/f32-vbinary/vopc-psimd.c.in
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$PSIMD_OP_F32 = {
diff --git a/src/f32-binop/vopc-scalar.c.in b/src/f32-vbinary/vopc-scalar.c.in
similarity index 98%
rename from src/f32-binop/vopc-scalar.c.in
rename to src/f32-vbinary/vopc-scalar.c.in
index 410d7e5..88fb010 100644
--- a/src/f32-binop/vopc-scalar.c.in
+++ b/src/f32-vbinary/vopc-scalar.c.in
@@ -10,7 +10,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$OP_FUNC = {
diff --git a/src/f32-binop/vopc-sse.c.in b/src/f32-vbinary/vopc-sse.c.in
similarity index 98%
rename from src/f32-binop/vopc-sse.c.in
rename to src/f32-vbinary/vopc-sse.c.in
index 44dd95e..aae6bd1 100644
--- a/src/f32-binop/vopc-sse.c.in
+++ b/src/f32-vbinary/vopc-sse.c.in
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
$_MM_OP_PS = {
diff --git a/src/f32-binop/vrsubc-neon-x4.c b/src/f32-vbinary/vrsubc-neon-x4.c
similarity index 98%
rename from src/f32-binop/vrsubc-neon-x4.c
rename to src/f32-vbinary/vrsubc-neon-x4.c
index 49cbfac..b185e54 100644
--- a/src/f32-binop/vrsubc-neon-x4.c
+++ b/src/f32-vbinary/vrsubc-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__neon_x4(
diff --git a/src/f32-binop/vrsubc-neon-x8.c b/src/f32-vbinary/vrsubc-neon-x8.c
similarity index 98%
rename from src/f32-binop/vrsubc-neon-x8.c
rename to src/f32-vbinary/vrsubc-neon-x8.c
index 5242e23..fcaf1d4 100644
--- a/src/f32-binop/vrsubc-neon-x8.c
+++ b/src/f32-vbinary/vrsubc-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__neon_x8(
diff --git a/src/f32-binop/vrsubc-psimd-x4.c b/src/f32-vbinary/vrsubc-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vrsubc-psimd-x4.c
rename to src/f32-vbinary/vrsubc-psimd-x4.c
index 92c7bcb..80cf600 100644
--- a/src/f32-binop/vrsubc-psimd-x4.c
+++ b/src/f32-vbinary/vrsubc-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__psimd_x4(
diff --git a/src/f32-binop/vrsubc-psimd-x8.c b/src/f32-vbinary/vrsubc-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vrsubc-psimd-x8.c
rename to src/f32-vbinary/vrsubc-psimd-x8.c
index 98f1645..0518f5f 100644
--- a/src/f32-binop/vrsubc-psimd-x8.c
+++ b/src/f32-vbinary/vrsubc-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__psimd_x8(
diff --git a/src/f32-binop/vrsubc-scalar-x1.c b/src/f32-vbinary/vrsubc-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vrsubc-scalar-x1.c
rename to src/f32-vbinary/vrsubc-scalar-x1.c
index defd37b..338a439 100644
--- a/src/f32-binop/vrsubc-scalar-x1.c
+++ b/src/f32-vbinary/vrsubc-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__scalar_x1(
diff --git a/src/f32-binop/vrsubc-scalar-x2.c b/src/f32-vbinary/vrsubc-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vrsubc-scalar-x2.c
rename to src/f32-vbinary/vrsubc-scalar-x2.c
index 63a9d88..2d10e46 100644
--- a/src/f32-binop/vrsubc-scalar-x2.c
+++ b/src/f32-vbinary/vrsubc-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__scalar_x2(
diff --git a/src/f32-binop/vrsubc-scalar-x4.c b/src/f32-vbinary/vrsubc-scalar-x4.c
similarity index 97%
rename from src/f32-binop/vrsubc-scalar-x4.c
rename to src/f32-vbinary/vrsubc-scalar-x4.c
index 7ac3b94..03c5923 100644
--- a/src/f32-binop/vrsubc-scalar-x4.c
+++ b/src/f32-vbinary/vrsubc-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__scalar_x4(
diff --git a/src/f32-binop/vrsubc-sse-x4.c b/src/f32-vbinary/vrsubc-sse-x4.c
similarity index 97%
rename from src/f32-binop/vrsubc-sse-x4.c
rename to src/f32-vbinary/vrsubc-sse-x4.c
index 910174e..0cf727e 100644
--- a/src/f32-binop/vrsubc-sse-x4.c
+++ b/src/f32-vbinary/vrsubc-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__sse_x4(
diff --git a/src/f32-binop/vrsubc-sse-x8.c b/src/f32-vbinary/vrsubc-sse-x8.c
similarity index 98%
rename from src/f32-binop/vrsubc-sse-x8.c
rename to src/f32-vbinary/vrsubc-sse-x8.c
index 84a4bde..48137f4 100644
--- a/src/f32-binop/vrsubc-sse-x8.c
+++ b/src/f32-vbinary/vrsubc-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vrsubc_ukernel__sse_x8(
diff --git a/src/f32-binop/vsub-neon-x4.c b/src/f32-vbinary/vsub-neon-x4.c
similarity index 98%
rename from src/f32-binop/vsub-neon-x4.c
rename to src/f32-vbinary/vsub-neon-x4.c
index ec345dd..859aa62 100644
--- a/src/f32-binop/vsub-neon-x4.c
+++ b/src/f32-vbinary/vsub-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__neon_x4(
diff --git a/src/f32-binop/vsub-neon-x8.c b/src/f32-vbinary/vsub-neon-x8.c
similarity index 98%
rename from src/f32-binop/vsub-neon-x8.c
rename to src/f32-vbinary/vsub-neon-x8.c
index cfbfe06..0bd9fa2 100644
--- a/src/f32-binop/vsub-neon-x8.c
+++ b/src/f32-vbinary/vsub-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__neon_x8(
diff --git a/src/f32-binop/vsub-psimd-x4.c b/src/f32-vbinary/vsub-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vsub-psimd-x4.c
rename to src/f32-vbinary/vsub-psimd-x4.c
index c1cbd74..de35ab2 100644
--- a/src/f32-binop/vsub-psimd-x4.c
+++ b/src/f32-vbinary/vsub-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__psimd_x4(
diff --git a/src/f32-binop/vsub-psimd-x8.c b/src/f32-vbinary/vsub-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vsub-psimd-x8.c
rename to src/f32-vbinary/vsub-psimd-x8.c
index 6160cc5..a0bd614 100644
--- a/src/f32-binop/vsub-psimd-x8.c
+++ b/src/f32-vbinary/vsub-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__psimd_x8(
diff --git a/src/f32-binop/vsub-scalar-x1.c b/src/f32-vbinary/vsub-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vsub-scalar-x1.c
rename to src/f32-vbinary/vsub-scalar-x1.c
index 47591a2..6f88531 100644
--- a/src/f32-binop/vsub-scalar-x1.c
+++ b/src/f32-vbinary/vsub-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__scalar_x1(
diff --git a/src/f32-binop/vsub-scalar-x2.c b/src/f32-vbinary/vsub-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vsub-scalar-x2.c
rename to src/f32-vbinary/vsub-scalar-x2.c
index 106c79f..b9948ff 100644
--- a/src/f32-binop/vsub-scalar-x2.c
+++ b/src/f32-vbinary/vsub-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__scalar_x2(
diff --git a/src/f32-binop/vsub-scalar-x4.c b/src/f32-vbinary/vsub-scalar-x4.c
similarity index 98%
rename from src/f32-binop/vsub-scalar-x4.c
rename to src/f32-vbinary/vsub-scalar-x4.c
index 5e8bab3..dca0c77 100644
--- a/src/f32-binop/vsub-scalar-x4.c
+++ b/src/f32-vbinary/vsub-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__scalar_x4(
diff --git a/src/f32-binop/vsub-sse-x4.c b/src/f32-vbinary/vsub-sse-x4.c
similarity index 98%
rename from src/f32-binop/vsub-sse-x4.c
rename to src/f32-vbinary/vsub-sse-x4.c
index 832c40a..d8c23be 100644
--- a/src/f32-binop/vsub-sse-x4.c
+++ b/src/f32-vbinary/vsub-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__sse_x4(
diff --git a/src/f32-binop/vsub-sse-x8.c b/src/f32-vbinary/vsub-sse-x8.c
similarity index 98%
rename from src/f32-binop/vsub-sse-x8.c
rename to src/f32-vbinary/vsub-sse-x8.c
index e0e0d52..0c157a3 100644
--- a/src/f32-binop/vsub-sse-x8.c
+++ b/src/f32-vbinary/vsub-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsub_ukernel__sse_x8(
diff --git a/src/f32-binop/vsubc-neon-x4.c b/src/f32-vbinary/vsubc-neon-x4.c
similarity index 98%
rename from src/f32-binop/vsubc-neon-x4.c
rename to src/f32-vbinary/vsubc-neon-x4.c
index 690802d..eab5204 100644
--- a/src/f32-binop/vsubc-neon-x4.c
+++ b/src/f32-vbinary/vsubc-neon-x4.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__neon_x4(
diff --git a/src/f32-binop/vsubc-neon-x8.c b/src/f32-vbinary/vsubc-neon-x8.c
similarity index 98%
rename from src/f32-binop/vsubc-neon-x8.c
rename to src/f32-vbinary/vsubc-neon-x8.c
index ad7e72f..18cd847 100644
--- a/src/f32-binop/vsubc-neon-x8.c
+++ b/src/f32-vbinary/vsubc-neon-x8.c
@@ -12,7 +12,7 @@
#include <arm_neon.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__neon_x8(
diff --git a/src/f32-binop/vsubc-psimd-x4.c b/src/f32-vbinary/vsubc-psimd-x4.c
similarity index 98%
rename from src/f32-binop/vsubc-psimd-x4.c
rename to src/f32-vbinary/vsubc-psimd-x4.c
index 911e375..54b4ece 100644
--- a/src/f32-binop/vsubc-psimd-x4.c
+++ b/src/f32-vbinary/vsubc-psimd-x4.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__psimd_x4(
diff --git a/src/f32-binop/vsubc-psimd-x8.c b/src/f32-vbinary/vsubc-psimd-x8.c
similarity index 98%
rename from src/f32-binop/vsubc-psimd-x8.c
rename to src/f32-vbinary/vsubc-psimd-x8.c
index 0c08465..f393a93 100644
--- a/src/f32-binop/vsubc-psimd-x8.c
+++ b/src/f32-vbinary/vsubc-psimd-x8.c
@@ -12,7 +12,7 @@
#include <psimd.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__psimd_x8(
diff --git a/src/f32-binop/vsubc-scalar-x1.c b/src/f32-vbinary/vsubc-scalar-x1.c
similarity index 96%
rename from src/f32-binop/vsubc-scalar-x1.c
rename to src/f32-vbinary/vsubc-scalar-x1.c
index c252c94..7a346af 100644
--- a/src/f32-binop/vsubc-scalar-x1.c
+++ b/src/f32-vbinary/vsubc-scalar-x1.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__scalar_x1(
diff --git a/src/f32-binop/vsubc-scalar-x2.c b/src/f32-vbinary/vsubc-scalar-x2.c
similarity index 97%
rename from src/f32-binop/vsubc-scalar-x2.c
rename to src/f32-vbinary/vsubc-scalar-x2.c
index 57bdd74..3b40715 100644
--- a/src/f32-binop/vsubc-scalar-x2.c
+++ b/src/f32-vbinary/vsubc-scalar-x2.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__scalar_x2(
diff --git a/src/f32-binop/vsubc-scalar-x4.c b/src/f32-vbinary/vsubc-scalar-x4.c
similarity index 97%
rename from src/f32-binop/vsubc-scalar-x4.c
rename to src/f32-vbinary/vsubc-scalar-x4.c
index ce7dd54..1095de8 100644
--- a/src/f32-binop/vsubc-scalar-x4.c
+++ b/src/f32-vbinary/vsubc-scalar-x4.c
@@ -11,7 +11,7 @@
#include <xnnpack/common.h>
#include <xnnpack/math.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__scalar_x4(
diff --git a/src/f32-binop/vsubc-sse-x4.c b/src/f32-vbinary/vsubc-sse-x4.c
similarity index 97%
rename from src/f32-binop/vsubc-sse-x4.c
rename to src/f32-vbinary/vsubc-sse-x4.c
index 9cf424b..279e884 100644
--- a/src/f32-binop/vsubc-sse-x4.c
+++ b/src/f32-vbinary/vsubc-sse-x4.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__sse_x4(
diff --git a/src/f32-binop/vsubc-sse-x8.c b/src/f32-vbinary/vsubc-sse-x8.c
similarity index 98%
rename from src/f32-binop/vsubc-sse-x8.c
rename to src/f32-vbinary/vsubc-sse-x8.c
index bc81a8e..caa16e9 100644
--- a/src/f32-binop/vsubc-sse-x8.c
+++ b/src/f32-vbinary/vsubc-sse-x8.c
@@ -12,7 +12,7 @@
#include <xmmintrin.h>
#include <xnnpack/common.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
void xnn_f32_vsubc_ukernel__sse_x8(
diff --git a/src/init.c b/src/init.c
index fc15c32..b5795af 100644
--- a/src/init.c
+++ b/src/init.c
@@ -41,9 +41,9 @@
#include <xnnpack/spmm.h>
#include <xnnpack/unpool.h>
#include <xnnpack/vadd.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
#include <xnnpack/vmulcaddc.h>
-#include <xnnpack/vunop.h>
+#include <xnnpack/vunary.h>
#include <xnnpack/zip.h>
#ifndef XNN_ENABLE_ASSEMBLY
@@ -208,10 +208,10 @@
.channel_tile = 8,
};
xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__neon_x8;
- xnn_params.f32.vmul = (struct vbinop_parameters) {
- .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
- .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
- .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+ xnn_params.f32.vmul = (struct vbinary_parameters) {
+ .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
+ .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+ .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
.element_tile = 8,
};
xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
@@ -483,10 +483,10 @@
.channel_tile = 8,
};
xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__neon_x8;
- xnn_params.f32.vmul = (struct vbinop_parameters) {
- .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
- .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
- .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+ xnn_params.f32.vmul = (struct vbinary_parameters) {
+ .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
+ .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+ .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
.element_tile = 8,
};
xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
@@ -721,10 +721,10 @@
.channel_tile = 8,
};
xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__sse_x8;
- xnn_params.f32.vmul = (struct vbinop_parameters) {
- .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__sse_x8,
- .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
- .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
+ xnn_params.f32.vmul = (struct vbinary_parameters) {
+ .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__sse_x8,
+ .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
+ .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
.element_tile = 8,
};
xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
@@ -922,10 +922,10 @@
.channel_tile = 8,
};
xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__psimd_x8;
- xnn_params.f32.vmul = (struct vbinop_parameters) {
- .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__psimd_x8,
- .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
- .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
+ xnn_params.f32.vmul = (struct vbinary_parameters) {
+ .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__psimd_x8,
+ .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
+ .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
.element_tile = 8,
};
xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
@@ -1098,10 +1098,10 @@
.channel_tile = 4,
};
xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__scalar_x4;
- xnn_params.f32.vmul = (struct vbinop_parameters) {
- .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__scalar_x4,
- .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
- .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
+ xnn_params.f32.vmul = (struct vbinary_parameters) {
+ .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__scalar_x4,
+ .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
+ .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
.element_tile = 8,
};
xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 8eb4fa4..4f2833a 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -583,7 +583,7 @@
union xnn_q8_add_params q8;
union xnn_f32_output_params f32;
} params;
- xnn_vbinop_ukernel_function ukernel;
+ xnn_vbinary_ukernel_function ukernel;
};
#ifndef __cplusplus
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c76315e..87625eb 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1076,27 +1076,27 @@
uint8_t* y,
const union xnn_q8_add_params* params);
-typedef void (*xnn_vbinop_ukernel_function)(
+typedef void (*xnn_vbinary_ukernel_function)(
size_t n,
const void* a,
const void* b,
void* y,
const void* params);
-typedef void (*xnn_f32_vbinop_ukernel_function)(
+typedef void (*xnn_f32_vbinary_ukernel_function)(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_output_params* params);
-typedef void (*xnn_vunop_ukernel_function)(
+typedef void (*xnn_vunary_ukernel_function)(
size_t n,
const void* x,
void* y,
const void* params);
-typedef void (*xnn_f32_vunop_ukernel_function)(
+typedef void (*xnn_f32_vunary_ukernel_function)(
size_t n,
const float* x,
float* y,
@@ -1195,10 +1195,10 @@
uint8_t log2_sr;
};
-struct vbinop_parameters {
- xnn_vbinop_ukernel_function op_ukernel;
- xnn_vbinop_ukernel_function opc_ukernel;
- xnn_vbinop_ukernel_function ropc_ukernel;
+struct vbinary_parameters {
+ xnn_vbinary_ukernel_function op_ukernel;
+ xnn_vbinary_ukernel_function opc_ukernel;
+ xnn_vbinary_ukernel_function ropc_ukernel;
// Number of elements in a tile.
// For best efficiency, micro-kernel must process a multiple of this number of elements in each call.
uint8_t element_tile;
@@ -1363,7 +1363,7 @@
xnn_univector_ukernel_function sigmoid;
struct prelu_parameters prelu;
xnn_vadd_ukernel_function vadd;
- struct vbinop_parameters vmul;
+ struct vbinary_parameters vmul;
struct vmulcaddc_parameters vmulcaddc;
// Sparse Matrix-Dense Matrix Multiplication (NR=1 block).
struct spmm_parameters spmm;
diff --git a/src/xnnpack/vbinop.h b/src/xnnpack/vbinary.h
similarity index 100%
rename from src/xnnpack/vbinop.h
rename to src/xnnpack/vbinary.h
diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h
new file mode 100644
index 0000000..46de442
--- /dev/null
+++ b/src/xnnpack/vunary.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VUNARY_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* x, \
+ float* y, \
+ const void* params);
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x16)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/xnnpack/vunop.h b/src/xnnpack/vunop.h
deleted file mode 100644
index 86b0723..0000000
--- a/src/xnnpack/vunop.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <xnnpack/params.h>
-#include <xnnpack/common.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#define DECLARE_F32_VUNOP_UKERNEL_FUNCTION(fn_name) \
- XNN_INTERNAL void fn_name( \
- size_t n, \
- const float* x, \
- float* y, \
- const void* params);
-
-DECLARE_F32_VUNOP_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x16)
-DECLARE_F32_VUNOP_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16)
-DECLARE_F32_VUNOP_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x8)
-DECLARE_F32_VUNOP_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x16)
-DECLARE_F32_VUNOP_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x8)
-DECLARE_F32_VUNOP_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x16)
-
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
diff --git a/test/f32-sigmoid.cc b/test/f32-sigmoid.cc
index d8b3e69..1c5ff16 100644
--- a/test/f32-sigmoid.cc
+++ b/test/f32-sigmoid.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-sigmoid.yaml
-// Generator: tools/generate-vunop-test.py
+// Generator: tools/generate-vunary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vunop.h>
-#include "vunop-microkernel-tester.h"
+#include <xnnpack/vunary.h>
+#include "vunary-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vadd.cc b/test/f32-vadd.cc
index a77e736..093f590 100644
--- a/test/f32-vadd.cc
+++ b/test/f32-vadd.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vadd.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinop-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinary-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vaddc.cc b/test/f32-vaddc.cc
index 95f6027..bf8421c 100644
--- a/test/f32-vaddc.cc
+++ b/test/f32-vaddc.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vaddc.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinopc-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinaryc-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vmul.cc b/test/f32-vmul.cc
index b68d1d1..1d61b6c 100644
--- a/test/f32-vmul.cc
+++ b/test/f32-vmul.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vmul.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinop-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinary-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vmulc.cc b/test/f32-vmulc.cc
index b76a770..6743035 100644
--- a/test/f32-vmulc.cc
+++ b/test/f32-vmulc.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vmulc.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinopc-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinaryc-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vrsubc.cc b/test/f32-vrsubc.cc
index 931ce16..b4a42eb 100644
--- a/test/f32-vrsubc.cc
+++ b/test/f32-vrsubc.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vrsubc.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinopc-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinaryc-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vsub.cc b/test/f32-vsub.cc
index d875c55..a2b86a2 100644
--- a/test/f32-vsub.cc
+++ b/test/f32-vsub.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vsub.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinop-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinary-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-vsubc.cc b/test/f32-vsubc.cc
index 3b4d91c..12a8b6f 100644
--- a/test/f32-vsubc.cc
+++ b/test/f32-vsubc.cc
@@ -5,7 +5,7 @@
//
// Auto-generated file. Do not edit!
// Specification: test/f32-vsubc.yaml
-// Generator: tools/generate-vbinop-test.py
+// Generator: tools/generate-vbinary-test.py
#include <gtest/gtest.h>
@@ -13,8 +13,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
-#include "vbinopc-microkernel-tester.h"
+#include <xnnpack/vbinary.h>
+#include "vbinaryc-microkernel-tester.h"
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/vbinop-microkernel-tester.h b/test/vbinary-microkernel-tester.h
similarity index 95%
rename from test/vbinop-microkernel-tester.h
rename to test/vbinary-microkernel-tester.h
index 3372bf6..0189883 100644
--- a/test/vbinop-microkernel-tester.h
+++ b/test/vbinary-microkernel-tester.h
@@ -88,7 +88,7 @@
return this->iterations_;
}
- void Test(xnn_f32_vbinop_ukernel_function vbinop, OpType op_type, Variant variant = Variant::Native) const {
+ void Test(xnn_f32_vbinary_ukernel_function vbinary, OpType op_type, Variant variant = Variant::Native) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
@@ -147,7 +147,7 @@
}
// Call optimized micro-kernel.
- vbinop(batch_size() * sizeof(float), a_data, b_data, y.data(), &output_params);
+ vbinary(batch_size() * sizeof(float), a_data, b_data, y.data(), &output_params);
// Verify results.
for (size_t i = 0; i < batch_size(); i++) {
diff --git a/test/vbinopc-microkernel-tester.h b/test/vbinaryc-microkernel-tester.h
similarity index 95%
rename from test/vbinopc-microkernel-tester.h
rename to test/vbinaryc-microkernel-tester.h
index e6fb173..eec19d0 100644
--- a/test/vbinopc-microkernel-tester.h
+++ b/test/vbinaryc-microkernel-tester.h
@@ -80,7 +80,7 @@
return this->iterations_;
}
- void Test(xnn_f32_vbinop_ukernel_function vbinopc, OpType op_type, Variant variant = Variant::Native) const {
+ void Test(xnn_f32_vbinary_ukernel_function vbinaryc, OpType op_type, Variant variant = Variant::Native) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
@@ -140,7 +140,7 @@
}
// Call optimized micro-kernel.
- vbinopc(batch_size() * sizeof(float), a_data, &b, y.data(), &output_params);
+ vbinaryc(batch_size() * sizeof(float), a_data, &b, y.data(), &output_params);
// Verify results.
for (size_t i = 0; i < batch_size(); i++) {
diff --git a/test/vunop-microkernel-tester.h b/test/vunary-microkernel-tester.h
similarity index 95%
rename from test/vunop-microkernel-tester.h
rename to test/vunary-microkernel-tester.h
index 26e3a25..e0387e6 100644
--- a/test/vunop-microkernel-tester.h
+++ b/test/vunary-microkernel-tester.h
@@ -77,7 +77,7 @@
return this->iterations_;
}
- void Test(xnn_f32_vunop_ukernel_function vunop, OpType op_type, Variant variant = Variant::Native) const {
+ void Test(xnn_f32_vunary_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(-25.0f, 25.0f), rng);
@@ -130,7 +130,7 @@
}
// Call optimized micro-kernel.
- vunop(batch_size() * sizeof(float), x_data, y.data(), &output_params);
+ vunary(batch_size() * sizeof(float), x_data, y.data(), &output_params);
// Verify results.
for (size_t i = 0; i < batch_size(); i++) {
diff --git a/tools/generate-vbinop-test.py b/tools/generate-vbinary-test.py
similarity index 97%
rename from tools/generate-vbinop-test.py
rename to tools/generate-vbinary-test.py
index 36bc030..74a21c4 100755
--- a/tools/generate-vbinop-test.py
+++ b/tools/generate-vbinary-test.py
@@ -198,9 +198,9 @@
raise ValueError("expected a list of micro-kernels in the spec")
if os.path.splitext(options.spec)[0].endswith("c"):
- header = "vbinopc-microkernel-tester.h"
+ header = "vbinaryc-microkernel-tester.h"
else:
- header = "vbinop-microkernel-tester.h"
+ header = "vbinary-microkernel-tester.h"
tests = """\
// Copyright 2019 Google LLC
//
@@ -217,7 +217,7 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinop.h>
+#include <xnnpack/vbinary.h>
#include "{header}"
""".format(specification=options.spec, generator=sys.argv[0], header=header)
diff --git a/tools/generate-vunop-test.py b/tools/generate-vunary-test.py
similarity index 98%
rename from tools/generate-vunop-test.py
rename to tools/generate-vunary-test.py
index d4adedd..9c0f808 100755
--- a/tools/generate-vunop-test.py
+++ b/tools/generate-vunary-test.py
@@ -148,8 +148,8 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vunop.h>
-#include "vunop-microkernel-tester.h"
+#include <xnnpack/vunary.h>
+#include "vunary-microkernel-tester.h"
""".format(specification=options.spec, generator=sys.argv[0])
for ukernel_spec in spec_yaml: