Binary elementwise microkernels

- ADD/MUL/SUB microkernel templates in scalar, PSIMD, SSE, NEON implementations
- ADDC/MULC/SUBC/RSUBC microkernel templates in scalar, PSIMD, SSE, NEON implementations
- Unit test generators
- Remove legacy (non-generated) microkernel implementations

PiperOrigin-RevId: 280528154
diff --git a/src/f32-binop/vop-scalar.c.in b/src/f32-binop/vop-scalar.c.in
new file mode 100644
index 0000000..ab6cfba
--- /dev/null
+++ b/src/f32-binop/vop-scalar.c.in
@@ -0,0 +1,85 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE >= 1
+$ABC = "0123456789ABCDEFGHIJKLMN"
+$assert OP in ["ADD", "MUL", "SUB"]
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinop.h>
+
+
+$OP_FUNC = {
+$  "ADD": lambda x, y: "%s + %s" % (x, y),
+$  "MUL": lambda x, y: "%s * %s" % (x, y),
+$  "SUB": lambda x, y: "%s - %s" % (x, y),
+$}[OP]
+void xnn_f32_v${OP.lower()}_ukernel__scalar_x${BATCH_TILE}(
+    size_t n,  // size in bytes, not elements: asserted to be a multiple of sizeof(float) and stepped in sizeof(float) units
+    const float* a,  // first operand array, read sequentially
+    const float* b,  // second operand array, read sequentially
+    float* y,  // output array, receives the clamped result of the elementwise operation
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max used for clamping
+{
+  assert(n != 0);  // an empty batch is not accepted
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+
+  const float vy_min = params->scalar.min;  // bound passed to math_max_f32 below
+  const float vy_max = params->scalar.max;  // bound passed to math_min_f32 below
+
+  $if BATCH_TILE > 1:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {  // main loop: one full tile of elements per iteration
+      $for N in range(BATCH_TILE):
+        const float va${ABC[N]} = a[${N}];
+      a += ${BATCH_TILE};
+
+      $for N in range(BATCH_TILE):
+        const float vb${ABC[N]} = b[${N}];
+      b += ${BATCH_TILE};
+
+      $for N in range(BATCH_TILE):
+        float vy${ABC[N]} = ${OP_FUNC("va" + ABC[N], "vb" + ABC[N])};
+
+      $for N in range(BATCH_TILE):
+        vy${ABC[N]} = math_max_f32(vy${ABC[N]}, vy_min);
+
+      $for N in range(BATCH_TILE):
+        vy${ABC[N]} = math_min_f32(vy${ABC[N]}, vy_max);
+
+      $for N in range(BATCH_TILE):
+        y[${N}] = vy${ABC[N]};
+      y += ${BATCH_TILE};
+    }
+    if XNN_UNLIKELY(n != 0) {  // bytes left over after the tiled loop (less than one full tile)
+      $if BATCH_TILE > 2:
+        do {  // more than one element may remain, so handle them one at a time
+          const float va = *a++;
+          const float vb = *b++;
+          float vy = ${OP_FUNC("va", "vb")};
+          vy = math_max_f32(vy, vy_min);
+          vy = math_min_f32(vy, vy_max);
+          *y++ = vy;
+          n -= sizeof(float);
+        } while (n != 0);
+      $else:
+        const float va = *a;  // tile of 2: exactly one element can remain, so no loop and no pointer advance
+        const float vb = *b;
+        float vy = ${OP_FUNC("va", "vb")};
+        vy = math_max_f32(vy, vy_min);
+        vy = math_min_f32(vy, vy_max);
+        *y = vy;
+    }
+  $else:
+    for (; n >= sizeof(float); n -= sizeof(float)) {  // tile of 1: plain element-by-element loop, no remainder handling needed
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = ${OP_FUNC("va", "vb")};
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+    }
+}