Initial open-source release

PiperOrigin-RevId: 271685289
diff --git a/scripts/generate-f16-gemm.sh b/scripts/generate-f16-gemm.sh
new file mode 100755
index 0000000..0a00557
--- /dev/null
+++ b/scripts/generate-f16-gemm.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F16 GEMM micro-kernel sources in src/f16-gemm by running
+# tools/xngen on one template with several MR values, then generates the
+# unit test source test/f16-gemm.cc from the spec test/f16-gemm.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+########################## ARM NEON with FP16 compute #########################
+### LD64 micro-kernels
+tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=4 -D NR=8 -o src/f16-gemm/4x8-neonfp16arith-ld64.c
+tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=6 -D NR=8 -o src/f16-gemm/6x8-neonfp16arith-ld64.c
+tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=8 -D NR=8 -o src/f16-gemm/8x8-neonfp16arith-ld64.c
+
+
+################################## Unit tests #################################
+tools/generate-gemm-test.py --spec test/f16-gemm.yaml --output test/f16-gemm.cc
diff --git a/scripts/generate-f32-dwconv.sh b/scripts/generate-f32-dwconv.sh
new file mode 100755
index 0000000..7f08ef8
--- /dev/null
+++ b/scripts/generate-f32-dwconv.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 DWCONV micro-kernel sources in src/f32-dwconv by running
+# tools/xngen on the scalar, NEON, PSIMD and SSE templates, then generates
+# the unit test source test/f32-dwconv.cc from test/f32-dwconv.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CR=1 -D MR=4  -D AR=2 -o src/f32-dwconv/up1x4-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CR=1 -D MR=9  -D AR=2 -o src/f32-dwconv/up1x9-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CR=1 -D MR=25 -D AR=2 -o src/f32-dwconv/up1x25-scalar.c
+
+################################### ARM NEON ##################################
+tools/xngen src/f32-dwconv/up-neon.c.in -D CR=4 -D MR=9 -D AR=1 -D FMA=0 -o src/f32-dwconv/up4x9-neon.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CR=4 -D MR=9 -D AR=1 -D FMA=1 -o src/f32-dwconv/up4x9-neonfma.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CR=8 -D MR=9 -D AR=1 -D FMA=1 -o src/f32-dwconv/up8x9-neonfma.c
+
+#################################### PSIMD ####################################
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CR=4 -D MR=4 -D AR=2 -o src/f32-dwconv/up4x4-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CR=4 -D MR=9 -D AR=2 -o src/f32-dwconv/up4x9-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CR=4 -D MR=25 -D AR=2 -o src/f32-dwconv/up4x25-psimd.c
+
+################################### x86 SSE ###################################
+tools/xngen src/f32-dwconv/up-sse.c.in -D CR=4 -D MR=4 -D AR=2 -o src/f32-dwconv/up4x4-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CR=4 -D MR=9 -D AR=2 -o src/f32-dwconv/up4x9-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CR=4 -D MR=25 -D AR=2 -o src/f32-dwconv/up4x25-sse.c
+
+
+################################## Unit tests #################################
+tools/generate-dwconv-test.py --spec test/f32-dwconv.yaml --output test/f32-dwconv.cc
diff --git a/scripts/generate-f32-gemm.sh b/scripts/generate-f32-gemm.sh
new file mode 100755
index 0000000..a2b8090
--- /dev/null
+++ b/scripts/generate-f32-gemm.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 GEMM micro-kernel sources in src/f32-gemm by running
+# tools/xngen on the templates in src/f32-gemm with INC=0, then generates
+# the unit test source test/f32-gemm.cc from test/f32-gemm.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+tools/xngen src/f32-gemm/scalar.c.in -D MR=1 -D NR=4 -D INC=0 -o src/f32-gemm/1x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=2 -D NR=4 -D INC=0 -o src/f32-gemm/2x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=2 -D INC=0 -o src/f32-gemm/4x2-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=4 -D INC=0 -o src/f32-gemm/4x4-scalar.c
+
+############################### AArch64 assembly ##############################
+tools/xngen src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in -D INC=0 -o src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in -D INC=0 -o src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in       -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-ld128.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in        -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in        -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in       -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-ld128.S
+
+################################### ARM NEON ##################################
+### LD64 micro-kernels
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/1x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/1x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=12 -D FMA=0 -D INC=0 -o src/f32-gemm/4x12-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=12 -D FMA=1 -D INC=0 -o src/f32-gemm/4x12-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/4x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/4x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/5x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/5x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/6x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/6x8-neonfma-ld64.c
+### LD128 micro-kernels
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/4x8-neon-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/4x8-neonfma-ld128.c
+### MRx2 micro-kernels
+tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=0 -D INC=0 -o src/f32-gemm/4x2-neon-ld64.c
+tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=1 -D INC=0 -o src/f32-gemm/4x2-neonfma-ld64.c
+
+#################################### PSIMD ####################################
+### LOAD1+BROADCAST micro-kernels
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/6x8-psimd-loadsplat.c
+### LOAD4+DUPLICATE micro-kernels
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/6x8-psimd-splat.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/6x8s4-psimd.c
+
+################################### x86 SSE ###################################
+### LOAD1+BROADCAST micro-kernels
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-sse-load1.c
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-sse-load1.c
+### LOAD4+DUPLICATE micro-kernels
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-sse-dup.c
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-sse-dup.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8s4-sse.c
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8s4-sse.c
+
+
+################################## Unit tests #################################
+tools/generate-gemm-test.py --spec test/f32-gemm.yaml --output test/f32-gemm.cc
diff --git a/scripts/generate-f32-gemminc.sh b/scripts/generate-f32-gemminc.sh
new file mode 100755
index 0000000..e141a52
--- /dev/null
+++ b/scripts/generate-f32-gemminc.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 GEMMINC micro-kernel sources in src/f32-gemminc by running
+# tools/xngen on the templates in src/f32-gemm with INC=1, then generates
+# the unit test source test/f32-gemminc.cc from test/f32-gemminc.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+tools/xngen src/f32-gemm/scalar.c.in -D MR=1 -D NR=4 -D INC=1 -o src/f32-gemminc/1x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=2 -D NR=4 -D INC=1 -o src/f32-gemminc/2x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=4 -D INC=1 -o src/f32-gemminc/4x4-scalar.c
+
+############################### AArch64 assembly ##############################
+tools/xngen src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in -D INC=1 -o src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in -D INC=1 -o src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in       -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-ld128.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in        -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in        -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in       -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-ld128.S
+
+################################### ARM NEON ##################################
+### LD64 micro-kernels
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/1x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/1x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=4 -D NR=12 -D FMA=0 -D INC=1 -o src/f32-gemminc/4x12-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=4 -D NR=12 -D FMA=1 -D INC=1 -o src/f32-gemminc/4x12-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/4x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/4x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=5 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/5x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=5 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/5x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/6x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in  -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/6x8-neonfma-ld64.c
+### LD128 micro-kernels
+tools/xngen src/f32-gemm/neon-ld128.c.in -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/4x8-neon-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/4x8-neonfma-ld128.c
+
+#################################### PSIMD ####################################
+### LOAD1+BROADCAST micro-kernels
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemminc/6x8-psimd-loadsplat.c
+### LOAD4+DUPLICATE micro-kernels
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemminc/6x8-psimd-splat.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemminc/6x8s4-psimd.c
+
+################################### x86 SSE ###################################
+### LOAD1+BROADCAST micro-kernels
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-sse-load1.c
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-sse-load1.c
+### LOAD4+DUPLICATE micro-kernels
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-sse-dup.c
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-sse-dup.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8s4-sse.c
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8s4-sse.c
+
+
+################################## Unit tests #################################
+tools/generate-gemm-test.py --spec test/f32-gemminc.yaml --output test/f32-gemminc.cc
diff --git a/scripts/generate-f32-igemm.sh b/scripts/generate-f32-igemm.sh
new file mode 100755
index 0000000..a91f909
--- /dev/null
+++ b/scripts/generate-f32-igemm.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 IGEMM micro-kernel sources in src/f32-igemm by running
+# tools/xngen on the scalar, NEON, PSIMD and SSE templates, then generates
+# the unit test source test/f32-igemm.cc from test/f32-igemm.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+tools/xngen src/f32-igemm/scalar.c.in -D MR=1 -D NR=4 -o src/f32-igemm/1x4-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=2 -D NR=4 -o src/f32-igemm/2x4-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=4 -D NR=2 -o src/f32-igemm/4x2-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=4 -D NR=4 -o src/f32-igemm/4x4-scalar.c
+
+################################### ARM NEON ##################################
+### LD64 micro-kernels
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=1 -D NR=8  -D FMA=0 -o src/f32-igemm/1x8-neon-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=12 -D FMA=0 -o src/f32-igemm/4x12-neon-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=12 -D FMA=1 -o src/f32-igemm/4x12-neonfma-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=4  -D FMA=0 -o src/f32-igemm/4x4-neon-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=4  -D FMA=1 -o src/f32-igemm/4x4-neonfma-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=8  -D FMA=0 -o src/f32-igemm/4x8-neon-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=8  -D FMA=1 -o src/f32-igemm/4x8-neonfma-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=6 -D NR=8  -D FMA=0 -o src/f32-igemm/6x8-neon-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=6 -D NR=8  -D FMA=1 -o src/f32-igemm/6x8-neonfma-ld64.c
+### LD128 micro-kernels
+tools/xngen src/f32-igemm/neon-ld128.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/4x8-neon-ld128.c
+tools/xngen src/f32-igemm/neon-ld128.c.in -D MR=4 -D NR=8 -D FMA=1 -o src/f32-igemm/4x8-neonfma-ld128.c
+### MRx2 micro-kernels
+tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=0 -o src/f32-igemm/4x2-neon-ld64.c
+tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=1 -o src/f32-igemm/4x2-neonfma-ld64.c
+
+#################################### PSIMD ####################################
+### LOAD1+BROADCAST micro-kernels
+tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-psimd-loadsplat.c
+tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-psimd-loadsplat.c
+tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -o src/f32-igemm/6x8-psimd-loadsplat.c
+### LOAD4+DUPLICATE micro-kernels
+tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-psimd-splat.c
+tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-psimd-splat.c
+tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=6 -D NR=8 -o src/f32-igemm/6x8-psimd-splat.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8s4-psimd.c
+tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8s4-psimd.c
+tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=6 -D NR=8 -o src/f32-igemm/6x8s4-psimd.c
+### MRx2 micro-kernels
+tools/xngen src/f32-igemm/MRx2c4-psimd.c.in -D MR=4 -D NR=2 -o src/f32-igemm/4x2c4-psimd.c
+
+################################### x86 SSE ###################################
+### LOAD1+BROADCAST micro-kernels
+tools/xngen src/f32-igemm/sse-load1.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-sse-load1.c
+tools/xngen src/f32-igemm/sse-load1.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-sse-load1.c
+### LOAD4+DUPLICATE micro-kernels
+tools/xngen src/f32-igemm/sse-dup.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-sse-dup.c
+tools/xngen src/f32-igemm/sse-dup.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-sse-dup.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-igemm/sse-shuffle.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8s4-sse.c
+tools/xngen src/f32-igemm/sse-shuffle.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8s4-sse.c
+### MRx2 micro-kernels
+tools/xngen src/f32-igemm/MRx2c4-sse.c.in -D MR=4 -D NR=2 -o src/f32-igemm/4x2c4-sse.c
+
+
+################################## Unit tests #################################
+tools/generate-gemm-test.py --spec test/f32-igemm.yaml --output test/f32-igemm.cc
diff --git a/scripts/generate-f32-ppmm.sh b/scripts/generate-f32-ppmm.sh
new file mode 100755
index 0000000..f915e81
--- /dev/null
+++ b/scripts/generate-f32-ppmm.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 PPMM micro-kernel sources in src/f32-ppmm by running
+# tools/xngen on the scalar, NEON, PSIMD and SSE templates, then generates
+# the unit test source test/f32-ppmm.cc from test/f32-ppmm.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=4 -D NR=4 -o src/f32-ppmm/4x4-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=2 -D NR=4 -o src/f32-ppmm/2x4-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=4 -D NR=2 -o src/f32-ppmm/4x2-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=3 -D NR=3 -o src/f32-ppmm/3x3-scalar.c
+
+################################### ARM NEON ##################################
+tools/xngen src/f32-ppmm/neon.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-ppmm/4x8-neon.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=4 -D NR=8 -D FMA=1 -o src/f32-ppmm/4x8-neonfma.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=8 -D NR=8 -D FMA=0 -o src/f32-ppmm/8x8-neon.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=8 -D NR=8 -D FMA=1 -o src/f32-ppmm/8x8-neonfma.c
+
+#################################### PSIMD ####################################
+tools/xngen src/f32-ppmm/psimd.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/4x8-psimd.c
+
+################################### x86 SSE ###################################
+tools/xngen src/f32-ppmm/sse.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/4x8-sse.c
+
+
+################################## Unit tests #################################
+tools/generate-gemm-test.py --spec test/f32-ppmm.yaml --output test/f32-ppmm.cc
diff --git a/scripts/generate-f32-spmm.sh b/scripts/generate-f32-spmm.sh
new file mode 100755
index 0000000..6a51e4f
--- /dev/null
+++ b/scripts/generate-f32-spmm.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 SPMM micro-kernel sources in src/f32-spmm by running
+# tools/xngen on the scalar, NEON and SSE templates, then generates the
+# unit test source test/f32-spmm.cc from the spec test/f32-spmm.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+### Microkernels without unrolling
+tools/xngen src/f32-spmm/scalar.c.in -D MR=1 -D NR=1 -D UNROLL=1 -o src/f32-spmm/1x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=2 -D NR=1 -D UNROLL=1 -o src/f32-spmm/2x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=4 -D NR=1 -D UNROLL=1 -o src/f32-spmm/4x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=1 -D UNROLL=1 -o src/f32-spmm/8x1-scalar.c
+### Microkernels with 2X unrolling
+tools/xngen src/f32-spmm/scalar.c.in -D MR=1 -D NR=1 -D UNROLL=2 -o src/f32-spmm/1x1-scalar-unroll2.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=2 -D NR=1 -D UNROLL=2 -o src/f32-spmm/2x1-scalar-unroll2.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=4 -D NR=1 -D UNROLL=2 -o src/f32-spmm/4x1-scalar-unroll2.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=1 -D UNROLL=2 -o src/f32-spmm/8x1-scalar-unroll2.c
+### Microkernels with software pipelining
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=1 -D NR=1 -o src/f32-spmm/1x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=2 -D NR=1 -o src/f32-spmm/2x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=4 -D NR=1 -o src/f32-spmm/4x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=8 -D NR=1 -o src/f32-spmm/8x1-scalar-pipelined.c
+
+################################### ARM NEON ##################################
+### Microkernels without unrolling
+tools/xngen src/f32-spmm/neon.c.in -D MR=4 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/4x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=8 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/8x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=12 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/12x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=16 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/16x1-neonfma.c
+### Microkernels with 2X unrolling
+tools/xngen src/f32-spmm/neon.c.in -D MR=4  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/4x1-neonfma-unroll2.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=8  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/8x1-neonfma-unroll2.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=16 -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/16x1-neonfma-unroll2.c
+### Microkernels for blocks of several output channels
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=4 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/4x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=8 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/8x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=12 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/12x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=16 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/16x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=4 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/4x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=8 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/8x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=12 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/12x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=16 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/16x4-neonfma.c
+### Microkernels with software pipelining
+tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=4 -D NR=1 -D FMA=1 -o src/f32-spmm/4x1-neonfma-pipelined.c
+tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=8 -D NR=1 -D FMA=1 -o src/f32-spmm/8x1-neonfma-pipelined.c
+tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=16 -D NR=1 -D FMA=1 -o src/f32-spmm/16x1-neonfma-pipelined.c
+
+################################### x86 SSE ###################################
+### Microkernels without unrolling
+tools/xngen src/f32-spmm/sse.c.in -D MR=4 -D NR=1 -D UNROLL=1 -o src/f32-spmm/4x1-sse.c
+tools/xngen src/f32-spmm/sse.c.in -D MR=8 -D NR=1 -D UNROLL=1 -o src/f32-spmm/8x1-sse.c
+
+
+################################## Unit tests #################################
+tools/generate-spmm-test.py --spec test/f32-spmm.yaml --output test/f32-spmm.cc
diff --git a/scripts/generate-f32-vmulcaddc.sh b/scripts/generate-f32-vmulcaddc.sh
new file mode 100755
index 0000000..ae70cea
--- /dev/null
+++ b/scripts/generate-f32-vmulcaddc.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the F32 VMULCADDC micro-kernel sources in src/f32-vmulcaddc by
+# running tools/xngen on the scalar, NEON, PSIMD and SSE templates, then
+# generates test/f32-vmulcaddc.cc from the spec test/f32-vmulcaddc.yaml.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+#################################### Scalar ###################################
+tools/xngen src/f32-vmulcaddc/scalar.c.in -D CR=1 -D MR=2 -o src/f32-vmulcaddc/c1-scalar-x2.c
+
+################################### ARM NEON ##################################
+tools/xngen src/f32-vmulcaddc/neon.c.in -D CR=4 -D MR=2 -D FMA=1 -o src/f32-vmulcaddc/c4-neonfma-x2.c
+tools/xngen src/f32-vmulcaddc/neon.c.in -D CR=4 -D MR=2 -D FMA=0 -o src/f32-vmulcaddc/c4-neon-x2.c
+
+#################################### PSIMD ####################################
+tools/xngen src/f32-vmulcaddc/psimd.c.in -D CR=4 -D MR=2 -o src/f32-vmulcaddc/c4-psimd-x2.c
+
+################################### x86 SSE ###################################
+tools/xngen src/f32-vmulcaddc/sse.c.in -D CR=4 -D MR=2 -o src/f32-vmulcaddc/c4-sse-x2.c
+
+
+################################## Unit tests #################################
+tools/generate-vmulcaddc-test.py --spec test/f32-vmulcaddc.yaml --output test/f32-vmulcaddc.cc
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
new file mode 100755
index 0000000..821f456
--- /dev/null
+++ b/scripts/generate-tests.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generates the unit test sources for the Q8 GEMM, IGEMM and DWCONV
+# micro-kernels and for the x32-packx packing micro-kernels by running the
+# test generator tools on the matching YAML specs in the test directory.
+#
+# All paths are relative to the current working directory, presumably the
+# root of the source tree - confirm before running from elsewhere.
+
+# Stop at the first failing generator command. Without this the script keeps
+# going and exits with the status of the last command only.
+set -e
+
+### Tests for Q8 micro-kernels
+tools/generate-gemm-test.py --spec test/q8-gemm.yaml --output test/q8-gemm.cc
+tools/generate-gemm-test.py --spec test/q8-igemm.yaml --output test/q8-igemm.cc
+tools/generate-dwconv-test.py --spec test/q8-dwconv.yaml --output test/q8-dwconv.cc
+
+### Tests for packing micro-kernels
+tools/generate-pack-test.py --spec test/x32-packx.yaml --output test/x32-packx.cc
+