AVX and FMA3 microkernels for GEMM/GEMMINC/IGEMM
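
Adds broadcast-style single-precision GEMM microkernels for x86 AVX and
FMA3 in 1x8, 4x8, 5x8, 6x8, and 7x8 tiles. Each kernel broadcasts one
element per row of A, multiplies it against an 8-wide panel of packed
weights, accumulates into per-row __m256 accumulators, clamps with the
min/max output params, and stores full 8-column tiles with an
nc & 4 / nc & 2 / nc & 1 remainder path. The AVX variants use separate
_mm256_mul_ps/_mm256_add_ps steps; the FMA3 variants fuse them with
_mm256_fmadd_ps.

For reference, a scalar sketch of what the 1x8 kernel computes, assuming
the packed-weight layout the kernel reads (an 8-float bias panel followed
by kc/sizeof(float) panels of 8 weights). The names below are
illustrative only and not part of the library:

    #include <stddef.h>

    // Hypothetical scalar reference for xnn_f32_gemm_ukernel_1x8__*_broadcast.
    // a: kc/sizeof(float) floats; w: 8 bias floats, then 8 weights per k step;
    // c: nc outputs (nc <= 8 in this sketch), clamped to [min, max].
    static void gemm_1x8_reference(
        size_t nc, size_t kc, const float* a, const float* w, float* c,
        float min, float max) {
      const size_t k_elements = kc / sizeof(float);  // kc is given in bytes
      for (size_t n = 0; n < nc; n++) {
        float acc = w[n];  // bias panel
        for (size_t k = 0; k < k_elements; k++) {
          acc += a[k] * w[8 + k * 8 + n];  // broadcast a[k] against column n
        }
        c[n] = acc < min ? min : (acc > max ? max : acc);  // output clamping
      }
    }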

PiperOrigin-RevId: 281807374
diff --git a/src/f32-gemm/1x8-avx-broadcast.c b/src/f32-gemm/1x8-avx-broadcast.c
new file mode 100644
index 0000000..79265ea
--- /dev/null
+++ b/src/f32-gemm/1x8-avx-broadcast.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_1x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/1x8-fma3-broadcast.c b/src/f32-gemm/1x8-fma3-broadcast.c
new file mode 100644
index 0000000..ae51c41
--- /dev/null
+++ b/src/f32-gemm/1x8-fma3-broadcast.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_1x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/4x8-avx-broadcast.c b/src/f32-gemm/4x8-avx-broadcast.c
new file mode 100644
index 0000000..ecd3a98
--- /dev/null
+++ b/src/f32-gemm/4x8-avx-broadcast.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_4x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/4x8-fma3-broadcast.c b/src/f32-gemm/4x8-fma3-broadcast.c
new file mode 100644
index 0000000..8523d2b
--- /dev/null
+++ b/src/f32-gemm/4x8-fma3-broadcast.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_4x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/5x8-avx-broadcast.c b/src/f32-gemm/5x8-avx-broadcast.c
new file mode 100644
index 0000000..62c2a36
--- /dev/null
+++ b/src/f32-gemm/5x8-avx-broadcast.c
@@ -0,0 +1,186 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_5x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+      vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/5x8-fma3-broadcast.c b/src/f32-gemm/5x8-fma3-broadcast.c
new file mode 100644
index 0000000..e5ad31f
--- /dev/null
+++ b/src/f32-gemm/5x8-fma3-broadcast.c
@@ -0,0 +1,186 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_5x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/6x8-avx-broadcast.c b/src/f32-gemm/6x8-avx-broadcast.c
new file mode 100644
index 0000000..4a718e8
--- /dev/null
+++ b/src/f32-gemm/6x8-avx-broadcast.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_6x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+      vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+      vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/6x8-fma3-broadcast.c b/src/f32-gemm/6x8-fma3-broadcast.c
new file mode 100644
index 0000000..05c8e0d
--- /dev/null
+++ b/src/f32-gemm/6x8-fma3-broadcast.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_6x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+      vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/7x8-avx-broadcast.c b/src/f32-gemm/7x8-avx-broadcast.c
new file mode 100644
index 0000000..ed6745d
--- /dev/null
+++ b/src/f32-gemm/7x8-avx-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_7x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    __m256 vacc6x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+      const __m256 va6 = _mm256_broadcast_ss(a6);
+      a6 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+      vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+      vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+      vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/7x8-fma3-broadcast.c b/src/f32-gemm/7x8-fma3-broadcast.c
new file mode 100644
index 0000000..4f0d223
--- /dev/null
+++ b/src/f32-gemm/7x8-fma3-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_7x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    __m256 vacc6x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+      const __m256 va6 = _mm256_broadcast_ss(a6);
+      a6 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+      vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+      vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
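+
+// Editorial sketch (not part of the auto-generated kernel above): a scalar
+// reference for what one mr x 8 output tile of these broadcast microkernels
+// computes. The name ref_gemm_tile and the parameters b_min/b_max are
+// hypothetical; the packed-weight layout assumed here -- 8 bias floats followed
+// by one group of 8 weights per k step -- is inferred from the `w` reads in the
+// kernel, not from separate documentation.
+static void ref_gemm_tile(
+    size_t mr,                 // number of valid rows (1..7 for the kernel above)
+    size_t kc,                 // reduction size in bytes, multiple of sizeof(float)
+    const float* a,            // A rows, a_stride bytes apart
+    size_t a_stride,
+    const float* w,            // packed weights: 8 bias floats, then 8 floats per k step
+    float* c,                  // output rows, cm_stride bytes apart
+    size_t cm_stride,
+    float b_min,
+    float b_max)
+{
+  const size_t kc_floats = kc / sizeof(float);
+  for (size_t m = 0; m < mr; m++) {
+    const float* am = (const float*) ((const char*) a + m * a_stride);
+    float* cm = (float*) ((char*) c + m * cm_stride);
+    for (size_t n = 0; n < 8; n++) {
+      float acc = w[n];  // mirrors the initial _mm256_load_ps(w + 0)
+      for (size_t k = 0; k < kc_floats; k++) {
+        acc += am[k] * w[8 + k * 8 + n];  // broadcast A element times packed B column
+      }
+      acc = acc > b_max ? b_max : acc;  // _mm256_min_ps(vacc, vmax)
+      acc = acc < b_min ? b_min : acc;  // _mm256_max_ps(vacc, vmin)
+      cm[n] = acc;
+    }
+  }
+}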
diff --git a/src/f32-gemm/8x8-fma3-broadcast.c b/src/f32-gemm/8x8-fma3-broadcast.c
new file mode 100644
index 0000000..c4ef3d5
--- /dev/null
+++ b/src/f32-gemm/8x8-fma3-broadcast.c
@@ -0,0 +1,255 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_8x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 8);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+  const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
+  float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 8) {
+    a7 = a6;
+    c7 = c6;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    __m256 vacc6x01234567 = vacc0x01234567;
+    __m256 vacc7x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+      const __m256 va6 = _mm256_broadcast_ss(a6);
+      a6 += 1;
+      const __m256 va7 = _mm256_broadcast_ss(a7);
+      a7 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+      vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+      vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+      vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+    vacc7x01234567 = _mm256_min_ps(vacc7x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+    vacc7x01234567 = _mm256_max_ps(vacc7x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c7, vacc7x01234567);
+      c7 = (float*) ((uintptr_t) c7 + cn_stride);
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a7 = (const float*) ((uintptr_t) a7 - kc);
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc7x0123 = _mm256_castps256_ps128(vacc7x01234567);
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c7, vacc7x0123);
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc7x0123 = _mm256_extractf128_ps(vacc7x01234567, 1);
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c7 += 4;
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c7, vacc7x0123);
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc7x0123 = _mm_movehl_ps(vacc7x0123, vacc7x0123);
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c7 += 2;
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c7, vacc7x0123);
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
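+
+// Editorial note on the row handling above (an observation from this code, not
+// from separate documentation): when mr < 8, the tail pointers a1..a7 / c1..c7
+// are clamped onto the previous row, so every accumulator still reads valid A
+// data and the redundant rows simply recompute (and re-store) an existing row's
+// result. Storing from c7 down to c0 additionally guarantees that the genuine
+// row's value is the last one written to any aliased destination.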
diff --git a/src/f32-gemm/avx-broadcast.c.in b/src/f32-gemm/avx-broadcast.c.in
new file mode 100644
index 0000000..90d7e08
--- /dev/null
+++ b/src/f32-gemm/avx-broadcast.c.in
@@ -0,0 +1,163 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMN"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+$ISA = {0: "avx", 3: "fma3"}[FMA]
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${ISA}_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    $if INC:
+      const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  $if INC:
+    assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  $for M in range(1, MR):
+    const float* a${M} = (const float*) ((uintptr_t) a${M-1} + a_stride);
+    float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+
+  do {
+    $if INC:
+      $for M in range(MR):
+        $for N in range(0, NR, 8):
+          __m256 vacc${M}x${ABC[N:N+8]} = _mm256_load_ps(acc + ${M*NR+N});
+      acc += ${MR*NR};
+    $else:
+      $for N in range(0, NR, 8):
+        __m256 vacc0x${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+      $for M in range(1, MR):
+        $for N in range(0, NR, 8):
+          __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
+      w += ${NR};
+
+    size_t k = kc;
+    do {
+      $for M in range(MR):
+        const __m256 va${M} = _mm256_broadcast_ss(a${M});
+        a${M} += 1;
+
+      const __m256 vb${ABC[0:8]} = _mm256_load_ps(w);
+      $for N in range(8, NR, 8):
+        const __m256 vb${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+      w += ${NR};
+
+      $for N in range(0, NR, 8):
+        $for M in range(MR):
+          $if FMA == 3:
+            vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
+          $else:
+            vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8]}));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    $for N in range(0, NR, 8):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    $for N in range(0, NR, 8):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
+
+    if XNN_LIKELY(nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+        $for N in range(8, NR, 8):
+          _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+        c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+      $for M in reversed(range(MR)):
+        a${M} = (const float*) ((uintptr_t) a${M} - kc);
+
+      nc -= ${NR};
+    } else {
+      $for LOG2N in reversed(range(NR.bit_length())):
+        $if NR != 1 << LOG2N:
+          if (nc & ${1 << LOG2N}) {
+            $if LOG2N >= 3:
+              $for M in reversed(range(MR)):
+                _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+                $for N in range(8, 1 << LOG2N, 8):
+                  _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+
+              $for M in reversed(range(MR)):
+                $for N in range(0, 1 << (LOG2N - 1), 8):
+                  vacc${M}x${ABC[N:N+8]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+8]};
+
+              $for M in reversed(range(MR)):
+                c${M} += ${1 << LOG2N};
+            $elif LOG2N == 2:
+              $for M in reversed(range(MR)):
+                _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
+
+              $for M in reversed(range(MR)):
+                vacc${M}x${ABC[0:4]} = _mm256_extractf128_ps(vacc${M}x${ABC[0:8]}, 1);
+
+              $for M in reversed(range(MR)):
+                c${M} += 4;
+            $elif LOG2N == 1:
+              $for M in reversed(range(MR)):
+                _mm_storel_pi((__m64*) c${M}, vacc${M}x${ABC[0:4]});
+
+              $for M in reversed(range(MR)):
+                vacc${M}x${ABC[0:4]} = _mm_movehl_ps(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]});
+
+              $for M in reversed(range(MR)):
+                c${M} += 2;
+            $elif LOG2N == 0:
+              $for M in reversed(range(MR)):
+                _mm_store_ss(c${M}, vacc${M}x${ABC[0:4]});
+          }
+        $if LOG2N == 3:
+          $for M in reversed(range(MR)):
+            __m128 vacc${M}x${ABC[0:4]} = _mm256_castps256_ps128(vacc${M}x${ABC[0:8]});
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
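+
+// Editorial note on the template above (inferred from the kernels generated in
+// this change, not from xngen documentation): the row-pointer setup emits
+// `mr <= M` for even M, `mr != MR` when the last row index M is odd, and
+// `mr < M+1` otherwise -- compare the 5x8, 6x8 and 8x8 kernels. The LOG2N loop
+// tests each power of two below NR against nc, which for NR == 8 expands into
+// the `nc & 4` / `nc & 2` / `nc & 1` store tails seen in every generated file,
+// narrowing the stores (storeu_ps, storel_pi, store_ss) and shifting the
+// surviving lanes down (extractf128, movehl) between steps.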
diff --git a/src/f32-gemminc/1x8-avx-broadcast.c b/src/f32-gemminc/1x8-avx-broadcast.c
new file mode 100644
index 0000000..f1e4490
--- /dev/null
+++ b/src/f32-gemminc/1x8-avx-broadcast.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_1x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    acc += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemminc/1x8-fma3-broadcast.c b/src/f32-gemminc/1x8-fma3-broadcast.c
new file mode 100644
index 0000000..204cfca
--- /dev/null
+++ b/src/f32-gemminc/1x8-fma3-broadcast.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_1x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    acc += 8;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
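+
+// Editorial note: within this change the _fma3_ kernels differ from the _avx_
+// ones only in the inner update -- _mm256_fmadd_ps(va, vb, vacc) instead of
+// _mm256_add_ps(vacc, _mm256_mul_ps(va, vb)). The fused form rounds once per
+// step rather than twice, so the two variants may differ in the last bit. A
+// minimal scalar illustration (standalone sketch, not project code; fmaf() from
+// <math.h> is the scalar analogue of the fused path, and the exact two-step
+// result assumes the compiler does not contract the separate multiply and add):
+#include <math.h>
+#include <stdio.h>
+
+static void fma_rounding_demo(void) {
+  const float a = 1.0f + 0x1.0p-12f;
+  const float b = 1.0f + 0x1.0p-12f;
+  const float c = -(1.0f + 0x1.0p-11f);
+  const float prod = a * b;           // rounds a*b (tie to even): 1 + 2^-11
+  const float two_step = prod + c;    // second rounding: exactly 0.0f
+  const float fused = fmaf(a, b, c);  // single rounding: 2^-24
+  printf("two-step: %a, fused: %a\n", two_step, fused);
+}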
diff --git a/src/f32-gemminc/4x8-avx-broadcast.c b/src/f32-gemminc/4x8-avx-broadcast.c
new file mode 100644
index 0000000..79ece60
--- /dev/null
+++ b/src/f32-gemminc/4x8-avx-broadcast.c
@@ -0,0 +1,165 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_4x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    acc += 32;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
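+
+// Editorial note (an inference from the code, not from documentation): the
+// clamping bounds come from params->sse.min / params->sse.max and are widened
+// with _mm256_broadcast_ps from a 4-float __m128, which suggests these AVX/FMA3
+// kernels reuse the same xnn_f32_output_params layout as the 128-bit SSE
+// kernels rather than introducing a separate 256-bit parameter variant.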
diff --git a/src/f32-gemminc/4x8-fma3-broadcast.c b/src/f32-gemminc/4x8-fma3-broadcast.c
new file mode 100644
index 0000000..07e0c56
--- /dev/null
+++ b/src/f32-gemminc/4x8-fma3-broadcast.c
@@ -0,0 +1,165 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_4x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    acc += 32;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
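+
+// Editorial note comparing the gemminc and gemm variants in this change: the
+// bodies are identical except for accumulator setup. gemm seeds every row of a
+// tile with the first 8 floats of the packed weights w, while gemminc loads an
+// 8-float partial sum per row from the extra `acc` argument, so it effectively
+// computes clamp(acc + A*B). Presumably this is what allows a large reduction
+// dimension to be processed in several passes, with a final gemminc pass adding
+// the remaining contribution on top of earlier partial results.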
diff --git a/src/f32-gemminc/5x8-avx-broadcast.c b/src/f32-gemminc/5x8-avx-broadcast.c
new file mode 100644
index 0000000..53ab1a8
--- /dev/null
+++ b/src/f32-gemminc/5x8-avx-broadcast.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_5x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    acc += 40;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+      vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemminc/5x8-fma3-broadcast.c b/src/f32-gemminc/5x8-fma3-broadcast.c
new file mode 100644
index 0000000..5589ab1
--- /dev/null
+++ b/src/f32-gemminc/5x8-fma3-broadcast.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_5x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    acc += 40;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemminc/6x8-avx-broadcast.c b/src/f32-gemminc/6x8-avx-broadcast.c
new file mode 100644
index 0000000..8bf0197
--- /dev/null
+++ b/src/f32-gemminc/6x8-avx-broadcast.c
@@ -0,0 +1,211 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_6x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+    acc += 48;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+      vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+      vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
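+
+// Editorial note on the outer loop above (an observation from this code): after
+// each full 8-column tile the a0..a5 pointers are rewound by kc bytes while w
+// keeps advancing, so the same rows of A are replayed against the next packed
+// 8-column panel of weights; c0..c5 instead step forward by cn_stride to the
+// next tile of the output.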
diff --git a/src/f32-gemminc/6x8-fma3-broadcast.c b/src/f32-gemminc/6x8-fma3-broadcast.c
new file mode 100644
index 0000000..08d2d7a
--- /dev/null
+++ b/src/f32-gemminc/6x8-fma3-broadcast.c
@@ -0,0 +1,211 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_6x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+    acc += 48;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+      vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
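Unlike the plain GEMM kernels, the GEMMINC variants seed each accumulator from the caller-supplied acc buffer (mr rows of 8 floats per tile, advanced by mr * 8 floats per column block) instead of from bias packed into w, so a large K dimension can be accumulated over several passes. A minimal scalar sketch of one 8-column tile of that computation; scalar_gemminc_tile is an illustrative name and the strides are given in float elements rather than bytes:

static void scalar_gemminc_tile(size_t mr, size_t kc_floats,
                                const float* a, size_t a_stride_floats,
                                const float* w,   /* packed kc_floats x 8 */
                                float* c, size_t cm_stride_floats,
                                const float* acc, /* mr x 8 partial sums */
                                float min, float max) {
  for (size_t m = 0; m < mr; m++) {
    for (size_t n = 0; n < 8; n++) {
      float sum = acc[m * 8 + n];  /* start from the incoming partial sum */
      for (size_t k = 0; k < kc_floats; k++) {
        sum += a[m * a_stride_floats + k] * w[k * 8 + n];
      }
      /* clamp to the [min, max] range carried in xnn_f32_output_params */
      sum = sum > max ? max : sum;
      c[m * cm_stride_floats + n] = sum < min ? min : sum;
    }
  }
}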
diff --git a/src/f32-gemminc/7x8-avx-broadcast.c b/src/f32-gemminc/7x8-avx-broadcast.c
new file mode 100644
index 0000000..a991ae9
--- /dev/null
+++ b/src/f32-gemminc/7x8-avx-broadcast.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_7x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+    __m256 vacc6x01234567 = _mm256_load_ps(acc + 48);
+    acc += 56;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+      const __m256 va6 = _mm256_broadcast_ss(a6);
+      a6 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+      vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+      vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+      vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+      vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+      vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+      vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemminc/7x8-fma3-broadcast.c b/src/f32-gemminc/7x8-fma3-broadcast.c
new file mode 100644
index 0000000..5ef6236
--- /dev/null
+++ b/src/f32-gemminc/7x8-fma3-broadcast.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_7x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+    __m256 vacc6x01234567 = _mm256_load_ps(acc + 48);
+    acc += 56;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+      const __m256 va6 = _mm256_broadcast_ss(a6);
+      a6 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+      vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+      vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemminc/8x8-fma3-broadcast.c b/src/f32-gemminc/8x8-fma3-broadcast.c
new file mode 100644
index 0000000..bbdb925
--- /dev/null
+++ b/src/f32-gemminc/8x8-fma3-broadcast.c
@@ -0,0 +1,257 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_8x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 8);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+  const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
+  float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 8) {
+    a7 = a6;
+    c7 = c6;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+    __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+    __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+    __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+    __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+    __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+    __m256 vacc6x01234567 = _mm256_load_ps(acc + 48);
+    __m256 vacc7x01234567 = _mm256_load_ps(acc + 56);
+    acc += 64;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_broadcast_ss(a0);
+      a0 += 1;
+      const __m256 va1 = _mm256_broadcast_ss(a1);
+      a1 += 1;
+      const __m256 va2 = _mm256_broadcast_ss(a2);
+      a2 += 1;
+      const __m256 va3 = _mm256_broadcast_ss(a3);
+      a3 += 1;
+      const __m256 va4 = _mm256_broadcast_ss(a4);
+      a4 += 1;
+      const __m256 va5 = _mm256_broadcast_ss(a5);
+      a5 += 1;
+      const __m256 va6 = _mm256_broadcast_ss(a6);
+      a6 += 1;
+      const __m256 va7 = _mm256_broadcast_ss(a7);
+      a7 += 1;
+
+      const __m256 vb01234567 = _mm256_load_ps(w);
+      w += 8;
+
+      vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+      vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+      vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+      vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+      vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+      vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+      vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+      vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+    vacc7x01234567 = _mm256_min_ps(vacc7x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+    vacc7x01234567 = _mm256_max_ps(vacc7x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c7, vacc7x01234567);
+      c7 = (float*) ((uintptr_t) c7 + cn_stride);
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a7 = (const float*) ((uintptr_t) a7 - kc);
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      __m128 vacc7x0123 = _mm256_castps256_ps128(vacc7x01234567);
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c7, vacc7x0123);
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc7x0123 = _mm256_extractf128_ps(vacc7x01234567, 1);
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c7 += 4;
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c7, vacc7x0123);
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc7x0123 = _mm_movehl_ps(vacc7x0123, vacc7x0123);
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c7 += 2;
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c7, vacc7x0123);
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/1x8-avx-broadcast.c b/src/f32-igemm/1x8-avx-broadcast.c
new file mode 100644
index 0000000..4b760bc
--- /dev/null
+++ b/src/f32-igemm/1x8-avx-broadcast.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_1x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+
+        vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
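The IGEMM variants read an indirection buffer instead of a dense A matrix: a holds ks / sizeof(void*) row pointers, consumed mr at a time, and any pointer equal to zero points at the shared zero buffer and therefore skips the a_offset rebase. After each 8-column tile the pointer array is rewound by ks bytes so the next tile walks the same rows again. A minimal scalar sketch of the accumulation for one output row, mirroring the 1x8 kernel above (one pointer per group); scalar_igemm_dot is an illustrative name, w here excludes the 8 bias floats the kernel loads first, and the offset is in float elements:

static float scalar_igemm_dot(const float** a, size_t ks_ptrs, size_t kc_floats,
                              const float* w,  /* (ks_ptrs * kc_floats) x 8, bias excluded */
                              size_t n, size_t a_offset_floats,
                              const float* zero, float bias) {
  float sum = bias;  /* the kernel reads the bias from the first 8 floats of w */
  for (size_t p = 0; p < ks_ptrs; p++) {
    const float* a0 = a[p];
    if (a0 != zero) {
      a0 += a_offset_floats;  /* real rows are rebased; the zero buffer is not */
    }
    for (size_t k = 0; k < kc_floats; k++) {
      sum += a0[k] * w[(p * kc_floats + k) * 8 + n];
    }
  }
  return sum;  /* the kernel then clamps the result to [min, max] before storing */
}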
diff --git a/src/f32-igemm/1x8-fma3-broadcast.c b/src/f32-igemm/1x8-fma3-broadcast.c
new file mode 100644
index 0000000..113bbba
--- /dev/null
+++ b/src/f32-igemm/1x8-fma3-broadcast.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_1x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+
+        vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
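The only functional difference between the avx and fma3 variants of each shape is the multiply-accumulate step: the AVX kernels issue a separate multiply and add, while the FMA3 kernels issue one fused multiply-add, which also rounds only once. Side by side, using the same operands as the loops above:

/* AVX: two instructions, the product is rounded before the add */
vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
/* FMA3: one fused instruction, a single rounding at the end */
vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);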
diff --git a/src/f32-igemm/4x8-avx-broadcast.c b/src/f32-igemm/4x8-avx-broadcast.c
new file mode 100644
index 0000000..259f4d6
--- /dev/null
+++ b/src/f32-igemm/4x8-avx-broadcast.c
@@ -0,0 +1,182 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_4x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+
+        vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+        vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+        vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+        vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/4x8-fma3-broadcast.c b/src/f32-igemm/4x8-fma3-broadcast.c
new file mode 100644
index 0000000..537bcdf
--- /dev/null
+++ b/src/f32-igemm/4x8-fma3-broadcast.c
@@ -0,0 +1,182 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_4x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+
+        vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+        vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+        vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+        vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/5x8-avx-broadcast.c b/src/f32-igemm/5x8-avx-broadcast.c
new file mode 100644
index 0000000..5290cac
--- /dev/null
+++ b/src/f32-igemm/5x8-avx-broadcast.c
@@ -0,0 +1,207 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_5x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (5 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      a += 5;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+
+        vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+        vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+        vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+        vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+        vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 5 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/5x8-fma3-broadcast.c b/src/f32-igemm/5x8-fma3-broadcast.c
new file mode 100644
index 0000000..298ed86
--- /dev/null
+++ b/src/f32-igemm/5x8-fma3-broadcast.c
@@ -0,0 +1,207 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_5x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (5 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      a += 5;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+
+        vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+        vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+        vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+        vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+        vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 5 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/6x8-avx-broadcast.c b/src/f32-igemm/6x8-avx-broadcast.c
new file mode 100644
index 0000000..6f1f6dc
--- /dev/null
+++ b/src/f32-igemm/6x8-avx-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_6x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (6 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    c5 = c4;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      a += 6;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+        const __m256 va5 = _mm256_broadcast_ss(a5);
+        a5 += 1;
+
+        vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+        vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+        vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+        vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+        vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+        vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 6 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/6x8-fma3-broadcast.c b/src/f32-igemm/6x8-fma3-broadcast.c
new file mode 100644
index 0000000..9d16854
--- /dev/null
+++ b/src/f32-igemm/6x8-fma3-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_6x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (6 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    c5 = c4;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      a += 6;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+        const __m256 va5 = _mm256_broadcast_ss(a5);
+        a5 += 1;
+
+        vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+        vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+        vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+        vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+        vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+        vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 6 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/7x8-avx-broadcast.c b/src/f32-igemm/7x8-avx-broadcast.c
new file mode 100644
index 0000000..8a56961
--- /dev/null
+++ b/src/f32-igemm/7x8-avx-broadcast.c
@@ -0,0 +1,257 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_7x8__avx_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (7 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    c5 = c4;
+  }
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    c6 = c5;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    __m256 vacc6x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      const float* restrict a6 = a[6];
+      assert(a6 != NULL);
+      if XNN_UNPREDICTABLE(a6 != zero) {
+        a6 = (const float*) ((uintptr_t) a6 + a_offset);
+      }
+      a += 7;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+        const __m256 va5 = _mm256_broadcast_ss(a5);
+        a5 += 1;
+        const __m256 va6 = _mm256_broadcast_ss(a6);
+        a6 += 1;
+
+        vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+        vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+        vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+        vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+        vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+        vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+        vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 7 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/7x8-fma3-broadcast.c b/src/f32-igemm/7x8-fma3-broadcast.c
new file mode 100644
index 0000000..89b17f3
--- /dev/null
+++ b/src/f32-igemm/7x8-fma3-broadcast.c
@@ -0,0 +1,257 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_7x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (7 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    c5 = c4;
+  }
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    c6 = c5;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    __m256 vacc6x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      const float* restrict a6 = a[6];
+      assert(a6 != NULL);
+      if XNN_UNPREDICTABLE(a6 != zero) {
+        a6 = (const float*) ((uintptr_t) a6 + a_offset);
+      }
+      a += 7;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+        const __m256 va5 = _mm256_broadcast_ss(a5);
+        a5 += 1;
+        const __m256 va6 = _mm256_broadcast_ss(a6);
+        a6 += 1;
+
+        vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+        vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+        vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+        vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+        vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+        vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+        vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 7 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/8x8-fma3-broadcast.c b/src/f32-igemm/8x8-fma3-broadcast.c
new file mode 100644
index 0000000..7cb4625
--- /dev/null
+++ b/src/f32-igemm/8x8-fma3-broadcast.c
@@ -0,0 +1,282 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_8x8__fma3_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 8);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (8 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    c5 = c4;
+  }
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    c6 = c5;
+  }
+  float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 8) {
+    c7 = c6;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_load_ps(w);
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc4x01234567 = vacc0x01234567;
+    __m256 vacc5x01234567 = vacc0x01234567;
+    __m256 vacc6x01234567 = vacc0x01234567;
+    __m256 vacc7x01234567 = vacc0x01234567;
+    w += 8;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      const float* restrict a6 = a[6];
+      assert(a6 != NULL);
+      if XNN_UNPREDICTABLE(a6 != zero) {
+        a6 = (const float*) ((uintptr_t) a6 + a_offset);
+      }
+      const float* restrict a7 = a[7];
+      assert(a7 != NULL);
+      if XNN_UNPREDICTABLE(a7 != zero) {
+        a7 = (const float*) ((uintptr_t) a7 + a_offset);
+      }
+      a += 8;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_load_ps(w);
+        w += 8;
+
+        const __m256 va0 = _mm256_broadcast_ss(a0);
+        a0 += 1;
+        const __m256 va1 = _mm256_broadcast_ss(a1);
+        a1 += 1;
+        const __m256 va2 = _mm256_broadcast_ss(a2);
+        a2 += 1;
+        const __m256 va3 = _mm256_broadcast_ss(a3);
+        a3 += 1;
+        const __m256 va4 = _mm256_broadcast_ss(a4);
+        a4 += 1;
+        const __m256 va5 = _mm256_broadcast_ss(a5);
+        a5 += 1;
+        const __m256 va6 = _mm256_broadcast_ss(a6);
+        a6 += 1;
+        const __m256 va7 = _mm256_broadcast_ss(a7);
+        a7 += 1;
+
+        vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+        vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+        vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+        vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+        vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+        vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+        vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+        vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 8 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+    vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+    vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+    vacc7x01234567 = _mm256_min_ps(vacc7x01234567, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+    vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+    vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+    vacc7x01234567 = _mm256_max_ps(vacc7x01234567, vmin);
+
+    if XNN_LIKELY(nc >= 8) {
+      _mm256_storeu_ps(c7, vacc7x01234567);
+      c7 = (float*) ((uintptr_t) c7 + cn_stride);
+      _mm256_storeu_ps(c6, vacc6x01234567);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm256_storeu_ps(c5, vacc5x01234567);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm256_storeu_ps(c4, vacc4x01234567);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm256_storeu_ps(c3, vacc3x01234567);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm256_storeu_ps(c2, vacc2x01234567);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm256_storeu_ps(c1, vacc1x01234567);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm256_storeu_ps(c0, vacc0x01234567);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      __m128 vacc7x0123 = _mm256_castps256_ps128(vacc7x01234567);
+      __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+      __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+      __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+      __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+      __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+      if (nc & 4) {
+        _mm_storeu_ps(c7, vacc7x0123);
+        _mm_storeu_ps(c6, vacc6x0123);
+        _mm_storeu_ps(c5, vacc5x0123);
+        _mm_storeu_ps(c4, vacc4x0123);
+        _mm_storeu_ps(c3, vacc3x0123);
+        _mm_storeu_ps(c2, vacc2x0123);
+        _mm_storeu_ps(c1, vacc1x0123);
+        _mm_storeu_ps(c0, vacc0x0123);
+
+        vacc7x0123 = _mm256_extractf128_ps(vacc7x01234567, 1);
+        vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+        vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+        vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+        vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+        vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+        c7 += 4;
+        c6 += 4;
+        c5 += 4;
+        c4 += 4;
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storel_pi((__m64*) c7, vacc7x0123);
+        _mm_storel_pi((__m64*) c6, vacc6x0123);
+        _mm_storel_pi((__m64*) c5, vacc5x0123);
+        _mm_storel_pi((__m64*) c4, vacc4x0123);
+        _mm_storel_pi((__m64*) c3, vacc3x0123);
+        _mm_storel_pi((__m64*) c2, vacc2x0123);
+        _mm_storel_pi((__m64*) c1, vacc1x0123);
+        _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+        vacc7x0123 = _mm_movehl_ps(vacc7x0123, vacc7x0123);
+        vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+        vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+        c7 += 2;
+        c6 += 2;
+        c5 += 2;
+        c4 += 2;
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        _mm_store_ss(c7, vacc7x0123);
+        _mm_store_ss(c6, vacc6x0123);
+        _mm_store_ss(c5, vacc5x0123);
+        _mm_store_ss(c4, vacc4x0123);
+        _mm_store_ss(c3, vacc3x0123);
+        _mm_store_ss(c2, vacc2x0123);
+        _mm_store_ss(c1, vacc1x0123);
+        _mm_store_ss(c0, vacc0x0123);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/avx-broadcast.c.in b/src/f32-igemm/avx-broadcast.c.in
new file mode 100644
index 0000000..29d4206
--- /dev/null
+++ b/src/f32-igemm/avx-broadcast.c.in
@@ -0,0 +1,163 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMN"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+$ISA = {0: "avx", 3: "fma3"}[FMA]
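+// mr and nc give the rows and columns of the output tile handled by one call; kc is the
+// reduction length in bytes. a is the indirection buffer of per-row input pointers; ks is
+// the number of bytes of indirection entries walked per pass (a multiple of
+// MR * sizeof(void*)), and a_offset is added to every input pointer that is not `zero`.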
+void xnn_f32_igemm_ukernel_${MR}x${NR}__${ISA}_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
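+  // Rows past mr collapse onto the previous row pointer, so stores for unused rows land in valid memory.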
+  $for M in range(1, MR):
+    float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
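+    // Seed every accumulator row with the first NR packed values of w (the bias).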
+    __m256 vacc0x${ABC[0:8]} = _mm256_load_ps(w);
+    $for N in range(8, NR, 8):
+      __m256 vacc0x${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+    $for M in range(1, MR):
+      $for N in range(0, NR, 8):
+        __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
+    w += ${NR};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const float* restrict a${M} = a[${M}];
+        assert(a${M} != NULL);
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const float*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = kc;
+      do {
+        const __m256 vb${ABC[0:8]} = _mm256_load_ps(w);
+        $for N in range(8, NR, 8):
+          const __m256 vb${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+        w += ${NR};
+
+        $for M in range(MR):
+          const __m256 va${M} = _mm256_broadcast_ss(a${M});
+          a${M} += 1;
+
+        $for M in range(MR):
+          $for N in range(0, NR, 8):
+            $if FMA == 3:
+              vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
+            $else:
+              vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8]}));
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+    $for N in range(0, NR, 8):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
+
+    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    $for N in range(0, NR, 8):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
+
+    if XNN_LIKELY(nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+        $for N in range(8, NR, 8):
+          _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+        c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= ${NR};
+    } else {
+      $for LOG2N in reversed(range(NR.bit_length())):
+        $if NR != 1 << LOG2N:
+          if (nc & ${1 << LOG2N}) {
+            $if LOG2N >= 3:
+              $for M in reversed(range(MR)):
+                _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
+                $for N in range(4, 1 << LOG2N, 4):
+                  _mm_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});
+
+              $for M in reversed(range(MR)):
+                $for N in range(0, 1 << (LOG2N - 1), 4):
+                  vacc${M}x${ABC[N:N+4]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+4]};
+
+              $for M in reversed(range(MR)):
+                c${M} += ${1 << LOG2N};
+            $elif LOG2N == 2:
+              $for M in reversed(range(MR)):
+                _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
+
+              $for M in reversed(range(MR)):
+                vacc${M}x${ABC[0:4]} = _mm256_extractf128_ps(vacc${M}x${ABC[0:8]}, 1);
+
+              $for M in reversed(range(MR)):
+                c${M} += 4;
+            $elif LOG2N == 1:
+              $for M in reversed(range(MR)):
+                _mm_storel_pi((__m64*) c${M}, vacc${M}x${ABC[0:4]});
+
+              $for M in reversed(range(MR)):
+                vacc${M}x${ABC[0:4]} = _mm_movehl_ps(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]});
+
+              $for M in reversed(range(MR)):
+                c${M} += 2;
+            $elif LOG2N == 0:
+              $for M in reversed(range(MR)):
+                _mm_store_ss(c${M}, vacc${M}x${ABC[0:4]});
+          }
+        $if LOG2N == 3:
+          $for M in reversed(range(MR)):
+            __m128 vacc${M}x${ABC[0:4]} = _mm256_castps256_ps128(vacc${M}x${ABC[0:8]});
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 0df00f5..4f16309 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -37,6 +37,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
@@ -58,6 +60,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld128)
@@ -71,6 +75,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__psimd)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__sse)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
@@ -79,6 +85,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat)
@@ -86,6 +94,9 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neon)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neonfma)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neon)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neonfma)
 
@@ -109,6 +120,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
@@ -127,6 +140,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld128)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128)
@@ -140,14 +155,18 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__psimd)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__sse)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat)
@@ -155,6 +174,9 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neon)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neonfma)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neon)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neonfma)
 
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 398545b..5faf766 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -39,6 +39,8 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__fma3_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
@@ -61,6 +63,8 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__fma3_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld128)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld128)
@@ -74,10 +78,14 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__psimd)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__sse)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__fma3_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__fma3_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat)
@@ -85,6 +93,9 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neon)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neonfma)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8__fma3_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neon)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neonfma)
 
diff --git a/src/xnnpack/isa-checks.h b/src/xnnpack/isa-checks.h
index 226640c..e5d1fc6 100644
--- a/src/xnnpack/isa-checks.h
+++ b/src/xnnpack/isa-checks.h
@@ -52,6 +52,13 @@
     } \
   } while (0)
 
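+// Skips the current test unless cpuinfo reports FMA3 support on the host processor.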
+#define TEST_REQUIRES_X86_FMA3 \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
 #define TEST_REQUIRES_X86_AVX2 \
   do { \
     if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) { \