AVX512F implementation of GEMM & IGEMM micro-kernels

PiperOrigin-RevId: 282810752
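
For context, each micro-kernel produces an MRxNR tile of the output with the
min/max clamp fused into the epilogue: the plain gemm variant seeds its
accumulators from the NR packed bias values at the front of w, the gemminc
variant seeds them from the caller-provided acc buffer instead, and a 16-lane
masked store covers the nc < 16 tail (e.g. nc = 5 yields mask 0b11111). The
igemm variants follow the same tile scheme, reading A through an indirection
buffer. A scalar model of the gemm contract, for reference only
(ref_f32_gemm_tile is illustrative and not part of this change):

  #include <math.h>
  #include <stddef.h>
  #include <stdint.h>

  // Reference for one tile (nc <= nr):
  //   c[m][n] = clamp(bias[n] + sum_k a[m][k] * b[k][n], vmin, vmax)
  // Strides are in bytes, and w is packed as [nr biases][kc/4 rows of nr
  // floats], matching the layout the vector kernels consume.
  static void ref_f32_gemm_tile(
      size_t mr, size_t nc, size_t kc, size_t nr,
      const float* a, size_t a_stride,
      const float* w,
      float* c, size_t cm_stride,
      float vmin, float vmax)
  {
    for (size_t m = 0; m < mr; m++) {
      const float* am = (const float*) ((uintptr_t) a + m * a_stride);
      float* cm = (float*) ((uintptr_t) c + m * cm_stride);
      for (size_t n = 0; n < nc; n++) {
        float acc = w[n];  // the bias block comes first in the packed weights
        for (size_t k = 0; k < kc / sizeof(float); k++) {
          acc += am[k] * w[nr + k * nr + n];
        }
        cm[n] = fminf(fmaxf(acc, vmin), vmax);
      }
    }
  }
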
diff --git a/src/f32-gemm/avx-broadcast.c.in b/src/f32-gemm/avx-broadcast.c.in
index 90d7e08..c7df40f 100644
--- a/src/f32-gemm/avx-broadcast.c.in
+++ b/src/f32-gemm/avx-broadcast.c.in
@@ -3,8 +3,8 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert NR % 4 == 0
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$assert NR % 8 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 
 #include <immintrin.h>
diff --git a/src/f32-gemm/avx512-broadcast.c.in b/src/f32-gemm/avx512-broadcast.c.in
new file mode 100644
index 0000000..4e69c17
--- /dev/null
+++ b/src/f32-gemm/avx512-broadcast.c.in
@@ -0,0 +1,143 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR % 16 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    $if INC:
+      const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  $if INC:
+    assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  $for M in range(1, MR):
+    const float* a${M} = (const float*) ((uintptr_t) a${M-1} + a_stride);
+    float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
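+  // Rows past the real mr alias the previous row, so every aN/cN pointer stays
+  // valid without per-row bounds checks; the stores below run from the highest
+  // row down, so the live row's data is written last.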
+
+  do {
+    $if INC:
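+      // gemminc variant: resume from a caller-provided accumulator tile instead of the bias.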
+      $for M in range(MR):
+        $for N in range(0, NR, 16):
+          __m512 vacc${M}x${ABC[N:N+16]} = _mm512_load_ps(acc + ${M*NR+N});
+      acc += ${MR*NR};
+    $else:
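+      // Plain gemm: seed every row's accumulators from the NR packed bias values.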
+      __m512 vacc0x${ABC[0:16]} = _mm512_load_ps(w);
+      $for N in range(16, NR, 16):
+        __m512 vacc0x${ABC[N:N+16]} = _mm512_load_ps(w + ${N});
+      $for M in range(1, MR):
+        $for N in range(0, NR, 16):
+          __m512 vacc${M}x${ABC[N:N+16]} = vacc0x${ABC[N:N+16]};
+      w += ${NR};
+
+    size_t k = kc;
+    do {
+      const __m512 vb${ABC[0:16]} = _mm512_load_ps(w);
+      $for N in range(16, NR, 16):
+        const __m512 vb${ABC[N:N+16]} = _mm512_load_ps(w + ${N});
+      w += ${NR};
+
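+      // Broadcast one A element per row and FMA it against the full NR-wide B slab.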
+      $for N in range(0, NR, 16):
+        $for M in range(MR):
+          vacc${M}x${ABC[N:N+16]} = _mm512_fmadd_ps(_mm512_set1_ps(*a${M}), vb${ABC[N:N+16]}, vacc${M}x${ABC[N:N+16]});
+
+      $for M in range(MR):
+        a${M} += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
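+    // Fused activation: clamp the accumulators to the [min, max] range from params.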
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    $for N in range(0, NR, 16):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+16]} = _mm512_min_ps(vacc${M}x${ABC[N:N+16]}, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    $for N in range(0, NR, 16):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+16]} = _mm512_max_ps(vacc${M}x${ABC[N:N+16]}, vmin);
+
+    if XNN_LIKELY(nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        _mm512_storeu_ps(c${M}, vacc${M}x${ABC[0:16]});
+        $for N in range(16, NR, 16):
+          _mm512_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+16]});
+        c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+      $for M in reversed(range(MR)):
+        a${M} = (const float*) ((uintptr_t) a${M} - kc);
+
+      nc -= ${NR};
+    } else {
+      $for LOG2N in reversed(range(4, NR.bit_length())):
+        $if NR != 1 << LOG2N:
+          if (nc & ${1 << LOG2N}) {
+            $if LOG2N >= 4:
+              $for M in reversed(range(MR)):
+                _mm512_storeu_ps(c${M}, vacc${M}x${ABC[0:16]});
+                $for N in range(16, 1 << LOG2N, 16):
+                  _mm512_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+16]});
+
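+              // Shift the upper accumulators down and advance c to finish the remaining columns.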
+              $for M in reversed(range(MR)):
+                $for N in range(0, 1 << (LOG2N - 1), 16):
+                  vacc${M}x${ABC[N:N+16]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+16]};
+
+              $for M in reversed(range(MR)):
+                c${M} += ${1 << LOG2N};
+          }
+        $if LOG2N == 4:
+          if (nc & 15) {
+            // Prepare mask for valid 32-bit elements (depends on nc).
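+            // For the NR=16 kernels generated here 0 < nc < 16, so ((1 << nc) - 1)
+            // sets exactly the nc low lanes and the shift cannot overflow.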
+            const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+            $for M in reversed(range(MR)):
+              _mm512_mask_storeu_ps(c${M}, vmask, vacc${M}x${ABC[0:16]});
+          }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen-inc/1x16-avx512f-broadcast.c b/src/f32-gemm/gen-inc/1x16-avx512f-broadcast.c
new file mode 100644
index 0000000..1c65ee6
--- /dev/null
+++ b/src/f32-gemm/gen-inc/1x16-avx512f-broadcast.c
@@ -0,0 +1,83 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(acc + 0);
+    acc += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+
+      a0 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen-inc/4x16-avx512f-broadcast.c b/src/f32-gemm/gen-inc/4x16-avx512f-broadcast.c
new file mode 100644
index 0000000..93d2335
--- /dev/null
+++ b/src/f32-gemm/gen-inc/4x16-avx512f-broadcast.c
@@ -0,0 +1,128 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(acc + 0);
+    __m512 vacc1x0123456789ABCDEF = _mm512_load_ps(acc + 16);
+    __m512 vacc2x0123456789ABCDEF = _mm512_load_ps(acc + 32);
+    __m512 vacc3x0123456789ABCDEF = _mm512_load_ps(acc + 48);
+    acc += 64;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen-inc/5x16-avx512f-broadcast.c b/src/f32-gemm/gen-inc/5x16-avx512f-broadcast.c
new file mode 100644
index 0000000..d7b03d1
--- /dev/null
+++ b/src/f32-gemm/gen-inc/5x16-avx512f-broadcast.c
@@ -0,0 +1,143 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(acc + 0);
+    __m512 vacc1x0123456789ABCDEF = _mm512_load_ps(acc + 16);
+    __m512 vacc2x0123456789ABCDEF = _mm512_load_ps(acc + 32);
+    __m512 vacc3x0123456789ABCDEF = _mm512_load_ps(acc + 48);
+    __m512 vacc4x0123456789ABCDEF = _mm512_load_ps(acc + 64);
+    acc += 80;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen-inc/6x16-avx512f-broadcast.c b/src/f32-gemm/gen-inc/6x16-avx512f-broadcast.c
new file mode 100644
index 0000000..5efa8df
--- /dev/null
+++ b/src/f32-gemm/gen-inc/6x16-avx512f-broadcast.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(acc + 0);
+    __m512 vacc1x0123456789ABCDEF = _mm512_load_ps(acc + 16);
+    __m512 vacc2x0123456789ABCDEF = _mm512_load_ps(acc + 32);
+    __m512 vacc3x0123456789ABCDEF = _mm512_load_ps(acc + 48);
+    __m512 vacc4x0123456789ABCDEF = _mm512_load_ps(acc + 64);
+    __m512 vacc5x0123456789ABCDEF = _mm512_load_ps(acc + 80);
+    acc += 96;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+      vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+      a5 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen-inc/7x16-avx512f-broadcast.c b/src/f32-gemm/gen-inc/7x16-avx512f-broadcast.c
new file mode 100644
index 0000000..f8317e2
--- /dev/null
+++ b/src/f32-gemm/gen-inc/7x16-avx512f-broadcast.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(acc + 0);
+    __m512 vacc1x0123456789ABCDEF = _mm512_load_ps(acc + 16);
+    __m512 vacc2x0123456789ABCDEF = _mm512_load_ps(acc + 32);
+    __m512 vacc3x0123456789ABCDEF = _mm512_load_ps(acc + 48);
+    __m512 vacc4x0123456789ABCDEF = _mm512_load_ps(acc + 64);
+    __m512 vacc5x0123456789ABCDEF = _mm512_load_ps(acc + 80);
+    __m512 vacc6x0123456789ABCDEF = _mm512_load_ps(acc + 96);
+    acc += 112;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+      vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+      vacc6x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a6), vb0123456789ABCDEF, vacc6x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+      a5 += 1;
+      a6 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+    vacc6x0123456789ABCDEF = _mm512_min_ps(vacc6x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+    vacc6x0123456789ABCDEF = _mm512_max_ps(vacc6x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c6, vacc6x0123456789ABCDEF);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c6, vmask, vacc6x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen-inc/8x16-avx512f-broadcast.c b/src/f32-gemm/gen-inc/8x16-avx512f-broadcast.c
new file mode 100644
index 0000000..4e6575a
--- /dev/null
+++ b/src/f32-gemm/gen-inc/8x16-avx512f-broadcast.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float*restrict acc,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 8);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+  const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
+  float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 8) {
+    a7 = a6;
+    c7 = c6;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(acc + 0);
+    __m512 vacc1x0123456789ABCDEF = _mm512_load_ps(acc + 16);
+    __m512 vacc2x0123456789ABCDEF = _mm512_load_ps(acc + 32);
+    __m512 vacc3x0123456789ABCDEF = _mm512_load_ps(acc + 48);
+    __m512 vacc4x0123456789ABCDEF = _mm512_load_ps(acc + 64);
+    __m512 vacc5x0123456789ABCDEF = _mm512_load_ps(acc + 80);
+    __m512 vacc6x0123456789ABCDEF = _mm512_load_ps(acc + 96);
+    __m512 vacc7x0123456789ABCDEF = _mm512_load_ps(acc + 112);
+    acc += 128;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+      vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+      vacc6x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a6), vb0123456789ABCDEF, vacc6x0123456789ABCDEF);
+      vacc7x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a7), vb0123456789ABCDEF, vacc7x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+      a5 += 1;
+      a6 += 1;
+      a7 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+    vacc6x0123456789ABCDEF = _mm512_min_ps(vacc6x0123456789ABCDEF, vmax);
+    vacc7x0123456789ABCDEF = _mm512_min_ps(vacc7x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+    vacc6x0123456789ABCDEF = _mm512_max_ps(vacc6x0123456789ABCDEF, vmin);
+    vacc7x0123456789ABCDEF = _mm512_max_ps(vacc7x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c7, vacc7x0123456789ABCDEF);
+      c7 = (float*) ((uintptr_t) c7 + cn_stride);
+      _mm512_storeu_ps(c6, vacc6x0123456789ABCDEF);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a7 = (const float*) ((uintptr_t) a7 - kc);
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c7, vmask, vacc7x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c6, vmask, vacc6x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/1x16-avx512f-broadcast.c b/src/f32-gemm/gen/1x16-avx512f-broadcast.c
new file mode 100644
index 0000000..c989d12
--- /dev/null
+++ b/src/f32-gemm/gen/1x16-avx512f-broadcast.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_1x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    w += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+
+      a0 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/4x16-avx512f-broadcast.c b/src/f32-gemm/gen/4x16-avx512f-broadcast.c
new file mode 100644
index 0000000..27e7c5a
--- /dev/null
+++ b/src/f32-gemm/gen/4x16-avx512f-broadcast.c
@@ -0,0 +1,126 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_4x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/5x16-avx512f-broadcast.c b/src/f32-gemm/gen/5x16-avx512f-broadcast.c
new file mode 100644
index 0000000..d6bf560
--- /dev/null
+++ b/src/f32-gemm/gen/5x16-avx512f-broadcast.c
@@ -0,0 +1,141 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_5x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/6x16-avx512f-broadcast.c b/src/f32-gemm/gen/6x16-avx512f-broadcast.c
new file mode 100644
index 0000000..7d1e27e
--- /dev/null
+++ b/src/f32-gemm/gen/6x16-avx512f-broadcast.c
@@ -0,0 +1,156 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_6x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc5x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+      vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+      a5 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
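
A note on the remainder path that closes each of these kernels: because the else-branch only runs when nc < 16 and the masked store is guarded by (nc & 15) != 0, the shift amount is always in 1..15, so (UINT32_C(1) << nc) - 1 safely produces a mask with exactly the nc low bits set, and _mm512_mask_storeu_ps writes only the valid output columns. A minimal standalone check of that mask construction (hypothetical driver, not part of this patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Mirrors the expression fed to _cvtu32_mask16 in the kernels above.
  for (uint32_t nc = 1; nc < 16; nc++) {
    const uint16_t mask = (uint16_t) ((UINT32_C(1) << nc) - UINT32_C(1));
    printf("nc=%2u mask=0x%04x\n", (unsigned) nc, (unsigned) mask);  // e.g. nc=5 -> 0x001f
  }
  return 0;
}
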
diff --git a/src/f32-gemm/gen/7x16-avx512f-broadcast.c b/src/f32-gemm/gen/7x16-avx512f-broadcast.c
new file mode 100644
index 0000000..532c972
--- /dev/null
+++ b/src/f32-gemm/gen/7x16-avx512f-broadcast.c
@@ -0,0 +1,171 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_7x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc5x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc6x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+      vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+      vacc6x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a6), vb0123456789ABCDEF, vacc6x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+      a5 += 1;
+      a6 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+    vacc6x0123456789ABCDEF = _mm512_min_ps(vacc6x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+    vacc6x0123456789ABCDEF = _mm512_max_ps(vacc6x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c6, vacc6x0123456789ABCDEF);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c6, vmask, vacc6x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
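
The pointer setup at the top of these kernels clamps rows instead of branching in the hot loop: when mr < MR, each out-of-range aM/cM pair is aliased to the previous row, so the extra rows redundantly recompute the last valid row and the final stores simply overwrite the same destination. The alternating <, <=, and != comparisons are equivalent for this purpose; presumably they are varied per row to suit the XNN_UNPREDICTABLE hint. A self-contained model of the idiom (illustrative only):

#include <stdio.h>

int main(void) {
  const char* rows[3] = {"row0", "row1", "row2"};
  const int mr = 2;  // only 2 valid rows this call
  const char* r0 = rows[0];
  const char* r1 = (mr < 2) ? r0 : rows[1];   // row 1 valid, keeps its own pointer
  const char* r2 = (mr <= 2) ? r1 : rows[2];  // row 2 invalid, aliases row 1
  printf("%s %s %s\n", r0, r1, r2);  // prints: row0 row1 row1
  return 0;
}
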
diff --git a/src/f32-gemm/gen/8x16-avx512f-broadcast.c b/src/f32-gemm/gen/8x16-avx512f-broadcast.c
new file mode 100644
index 0000000..b35a8b9
--- /dev/null
+++ b/src/f32-gemm/gen/8x16-avx512f-broadcast.c
@@ -0,0 +1,186 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_8x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float*restrict a,
+    size_t a_stride,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 8);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    a4 = a3;
+    c4 = c3;
+  }
+  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    a5 = a4;
+    c5 = c4;
+  }
+  const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    a6 = a5;
+    c6 = c5;
+  }
+  const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
+  float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 8) {
+    a7 = a6;
+    c7 = c6;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc5x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc6x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc7x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t k = kc;
+    do {
+      const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+      w += 16;
+
+      vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+      vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+      vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+      vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+      vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+      vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+      vacc6x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a6), vb0123456789ABCDEF, vacc6x0123456789ABCDEF);
+      vacc7x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a7), vb0123456789ABCDEF, vacc7x0123456789ABCDEF);
+
+      a0 += 1;
+      a1 += 1;
+      a2 += 1;
+      a3 += 1;
+      a4 += 1;
+      a5 += 1;
+      a6 += 1;
+      a7 += 1;
+
+      k -= sizeof(float);
+    } while (k != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+    vacc6x0123456789ABCDEF = _mm512_min_ps(vacc6x0123456789ABCDEF, vmax);
+    vacc7x0123456789ABCDEF = _mm512_min_ps(vacc7x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+    vacc6x0123456789ABCDEF = _mm512_max_ps(vacc6x0123456789ABCDEF, vmin);
+    vacc7x0123456789ABCDEF = _mm512_max_ps(vacc7x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c7, vacc7x0123456789ABCDEF);
+      c7 = (float*) ((uintptr_t) c7 + cn_stride);
+      _mm512_storeu_ps(c6, vacc6x0123456789ABCDEF);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a7 = (const float*) ((uintptr_t) a7 - kc);
+      a6 = (const float*) ((uintptr_t) a6 - kc);
+      a5 = (const float*) ((uintptr_t) a5 - kc);
+      a4 = (const float*) ((uintptr_t) a4 - kc);
+      a3 = (const float*) ((uintptr_t) a3 - kc);
+      a2 = (const float*) ((uintptr_t) a2 - kc);
+      a1 = (const float*) ((uintptr_t) a1 - kc);
+      a0 = (const float*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c7, vmask, vacc7x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c6, vmask, vacc6x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
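
For readers checking the arithmetic: the GEMM kernels consume w in the packed layout the loads above imply — NR bias values first (loaded straight into the row-0 accumulators and copied to the other rows), then one group of NR weights per k step — and clamp with vmax before vmin. A hedged scalar reference of the same computation (illustrative names, not XNNPACK API; strides in floats, not bytes):

#include <stddef.h>

void gemm_reference(
    size_t mr, size_t nr, size_t kc_floats,
    const float* a, size_t a_stride_floats,
    const float* w,  // packed: nr bias values, then kc groups of nr weights
    float* c, size_t cm_stride_floats,
    float output_min, float output_max)
{
  for (size_t m = 0; m < mr; m++) {
    for (size_t n = 0; n < nr; n++) {
      float acc = w[n];  // bias initializes the accumulator
      for (size_t k = 0; k < kc_floats; k++) {
        acc += a[m * a_stride_floats + k] * w[nr + k * nr + n];
      }
      if (acc > output_max) acc = output_max;  // the _mm512_min_ps pass
      if (acc < output_min) acc = output_min;  // the _mm512_max_ps pass
      c[m * cm_stride_floats + n] = acc;
    }
  }
}
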
diff --git a/src/f32-igemm/avx-broadcast.c.in b/src/f32-igemm/avx-broadcast.c.in
index 29d4206..72215d5 100644
--- a/src/f32-igemm/avx-broadcast.c.in
+++ b/src/f32-igemm/avx-broadcast.c.in
@@ -3,8 +3,8 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert NR % 4 == 0
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$assert NR % 8 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 
 #include <immintrin.h>
diff --git a/src/f32-igemm/avx512-broadcast.c.in b/src/f32-igemm/avx512-broadcast.c.in
new file mode 100644
index 0000000..241b489
--- /dev/null
+++ b/src/f32-igemm/avx512-broadcast.c.in
@@ -0,0 +1,143 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR % 16 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_${MR}x${NR}__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  $for M in range(1, MR):
+    float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
+    __m512 vacc0x${ABC[0:16]} = _mm512_load_ps(w);
+    $for N in range(16, NR, 16):
+      __m512 vacc0x${ABC[N:N+16]} = _mm512_load_ps(w + ${N});
+    $for M in range(1, MR):
+      $for N in range(0, NR, 16):
+        __m512 vacc${M}x${ABC[N:N+16]} = vacc0x${ABC[N:N+16]};
+    w += ${NR};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const float* restrict a${M} = a[${M}];
+        assert(a${M} != NULL);
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const float*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = kc;
+      do {
+        const __m512 vb${ABC[0:16]} = _mm512_load_ps(w);
+        $for N in range(16, NR, 16):
+          const __m512 vb${ABC[N:N+16]} = _mm512_load_ps(w + ${N});
+        w += ${NR};
+
+        $for M in range(MR):
+          $for N in range(0, NR, 16):
+            vacc${M}x${ABC[N:N+16]} = _mm512_fmadd_ps(_mm512_set1_ps(*a${M}), vb${ABC[N:N+16]}, vacc${M}x${ABC[N:N+16]});
+
+        $for M in range(MR):
+          a${M} += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    $for N in range(0, NR, 16):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+16]} = _mm512_min_ps(vacc${M}x${ABC[N:N+16]}, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    $for N in range(0, NR, 16):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+16]} = _mm512_max_ps(vacc${M}x${ABC[N:N+16]}, vmin);
+
+    if XNN_LIKELY(nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        _mm512_storeu_ps(c${M}, vacc${M}x${ABC[0:16]});
+        $for N in range(16, NR, 16):
+          _mm512_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+16]});
+        c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= ${NR};
+    } else {
+      $for LOG2N in reversed(range(4, NR.bit_length())):
+        $if NR != 1 << LOG2N:
+          if (nc & ${1 << LOG2N}) {
+            $if LOG2N >= 4:
+              $for M in reversed(range(MR)):
+                _mm512_storeu_ps(c${M}, vacc${M}x${ABC[0:16]});
+                $for N in range(16, 1 << LOG2N, 16):
+                  _mm512_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+16]});
+
+              $for M in reversed(range(MR)):
+                $for N in range(0, 1 << (LOG2N - 1), 16):
+                  vacc${M}x${ABC[N:N+16]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+16]};
+
+              $for M in reversed(range(MR)):
+                c${M} += ${1 << LOG2N};
+          }
+        $if LOG2N == 4:
+          if (nc & 15) {
+            // Prepare mask for valid 32-bit elements (depends on nc).
+            const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+            $for M in reversed(range(MR)):
+              _mm512_mask_storeu_ps(c${M}, vmask, vacc${M}x${ABC[0:16]});
+          }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
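
Unlike the GEMM variant, the IGEMM template reads A through an indirection buffer: a holds ks / (MR * sizeof(void*)) groups of MR row pointers, entries equal to the shared zero row are left un-offset (so padding taps contribute zeros), and after each NR-wide column tile the kernel rewinds a by ks bytes because the same input pointers apply to every output-channel tile. A hypothetical sketch of filling one such group for MR = 4 (names and bounds logic are illustrative, not XNNPACK API):

#include <stddef.h>

void fill_one_position(
    const float** indirection,  // receives 4 row pointers
    const float* input, size_t input_stride_floats,
    const float* zero,          // shared all-zeros row
    ptrdiff_t first_row, size_t input_rows)
{
  for (size_t i = 0; i < 4; i++) {
    const ptrdiff_t row = first_row + (ptrdiff_t) i;
    indirection[i] = (row >= 0 && (size_t) row < input_rows)
        ? input + (size_t) row * input_stride_floats
        : zero;  // the kernel sees aN == zero and skips the a_offset adjustment
  }
}
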
diff --git a/src/f32-igemm/gen/1x16-avx512f-broadcast.c b/src/f32-igemm/gen/1x16-avx512f-broadcast.c
new file mode 100644
index 0000000..e8afc80
--- /dev/null
+++ b/src/f32-igemm/gen/1x16-avx512f-broadcast.c
@@ -0,0 +1,95 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_1x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    w += 16;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      do {
+        const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+        w += 16;
+
+        vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+
+        a0 += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
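
The __avx512f_broadcast suffix refers to the inner-loop strategy visible here: each k step splats a single scalar of A across a zmm register and issues one fused multiply-add per 16 output columns. A standalone illustration of that step (requires AVX-512F; compile with e.g. -mavx512f):

#include <immintrin.h>

// One k-step for one row: acc += broadcast(a_scalar) * b[0..15].
// b16 must be 64-byte aligned, matching the packed-weight loads above.
__m512 broadcast_fma_step(float a_scalar, const float* b16, __m512 acc) {
  return _mm512_fmadd_ps(_mm512_set1_ps(a_scalar), _mm512_load_ps(b16), acc);
}
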
diff --git a/src/f32-igemm/gen/4x16-avx512f-broadcast.c b/src/f32-igemm/gen/4x16-avx512f-broadcast.c
new file mode 100644
index 0000000..e76a8cc
--- /dev/null
+++ b/src/f32-igemm/gen/4x16-avx512f-broadcast.c
@@ -0,0 +1,146 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_4x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      do {
+        const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+        w += 16;
+
+        vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+        vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+        vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+        vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+
+        a0 += 1;
+        a1 += 1;
+        a2 += 1;
+        a3 += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
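
The clamp constants are kept in the 128-bit SSE layout of xnn_f32_output_params and widened on entry to the epilogue: _mm512_broadcast_f32x4 repeats the 16-byte lane four times to fill a zmm register, which yields a uniform clamp provided all four floats in params->sse.min/max hold the same value (which the layout shared with the SSE/AVX kernels presumably guarantees). In isolation:

#include <immintrin.h>

// sse_max must be 16-byte aligned, since _mm_load_ps is an aligned load.
__m512 widen_clamp_constant(const float sse_max[4]) {
  return _mm512_broadcast_f32x4(_mm_load_ps(sse_max));
}
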
diff --git a/src/f32-igemm/gen/5x16-avx512f-broadcast.c b/src/f32-igemm/gen/5x16-avx512f-broadcast.c
new file mode 100644
index 0000000..4abcdbb
--- /dev/null
+++ b/src/f32-igemm/gen/5x16-avx512f-broadcast.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_5x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 5);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (5 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      a += 5;
+
+      size_t k = kc;
+      do {
+        const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+        w += 16;
+
+        vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+        vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+        vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+        vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+        vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+
+        a0 += 1;
+        a1 += 1;
+        a2 += 1;
+        a3 += 1;
+        a4 += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 5 * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/6x16-avx512f-broadcast.c b/src/f32-igemm/gen/6x16-avx512f-broadcast.c
new file mode 100644
index 0000000..e85b2bc
--- /dev/null
+++ b/src/f32-igemm/gen/6x16-avx512f-broadcast.c
@@ -0,0 +1,180 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_6x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 6);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (6 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 6) {
+    c5 = c4;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc5x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      a += 6;
+
+      size_t k = kc;
+      do {
+        const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+        w += 16;
+
+        vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+        vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+        vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+        vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+        vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+        vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+
+        a0 += 1;
+        a1 += 1;
+        a2 += 1;
+        a3 += 1;
+        a4 += 1;
+        a5 += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 6 * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/7x16-avx512f-broadcast.c b/src/f32-igemm/gen/7x16-avx512f-broadcast.c
new file mode 100644
index 0000000..24a2865
--- /dev/null
+++ b/src/f32-igemm/gen/7x16-avx512f-broadcast.c
@@ -0,0 +1,197 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_7x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 7);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (7 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    c5 = c4;
+  }
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    c6 = c5;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc5x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc6x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      const float* restrict a6 = a[6];
+      assert(a6 != NULL);
+      if XNN_UNPREDICTABLE(a6 != zero) {
+        a6 = (const float*) ((uintptr_t) a6 + a_offset);
+      }
+      a += 7;
+
+      size_t k = kc;
+      do {
+        const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+        w += 16;
+
+        vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+        vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+        vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+        vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+        vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+        vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+        vacc6x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a6), vb0123456789ABCDEF, vacc6x0123456789ABCDEF);
+
+        a0 += 1;
+        a1 += 1;
+        a2 += 1;
+        a3 += 1;
+        a4 += 1;
+        a5 += 1;
+        a6 += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 7 * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+    vacc6x0123456789ABCDEF = _mm512_min_ps(vacc6x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+    vacc6x0123456789ABCDEF = _mm512_max_ps(vacc6x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c6, vacc6x0123456789ABCDEF);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c6, vmask, vacc6x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
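
All _mm512_load_ps(w) calls in these kernels are aligned loads, so the packed-weight buffer must be 64-byte aligned — presumably guaranteed by XNNPACK's weight packing — while the C-matrix stores deliberately use the unaligned storeu/mask_storeu forms. A caller-side sketch of a compatible allocation (illustrative; C11 aligned_alloc requires the size to be a multiple of the alignment):

#include <stdlib.h>

float* alloc_packed_weights(size_t count) {
  const size_t bytes = ((count * sizeof(float)) + 63) & ~(size_t) 63;
  return (float*) aligned_alloc(64, bytes);
}
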
diff --git a/src/f32-igemm/gen/8x16-avx512f-broadcast.c b/src/f32-igemm/gen/8x16-avx512f-broadcast.c
new file mode 100644
index 0000000..1299bd5
--- /dev/null
+++ b/src/f32-igemm/gen/8x16-avx512f-broadcast.c
@@ -0,0 +1,214 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/avx512-broadcast.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_8x16__avx512f_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const float**restrict a,
+    const float*restrict w,
+    float*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(mr != 0);
+  assert(mr <= 8);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (8 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 4) {
+    c3 = c2;
+  }
+  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 4) {
+    c4 = c3;
+  }
+  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 6) {
+    c5 = c4;
+  }
+  float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 6) {
+    c6 = c5;
+  }
+  float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 8) {
+    c7 = c6;
+  }
+
+  do {
+    __m512 vacc0x0123456789ABCDEF = _mm512_load_ps(w);
+    __m512 vacc1x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc2x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc3x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc4x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc5x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc6x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    __m512 vacc7x0123456789ABCDEF = vacc0x0123456789ABCDEF;
+    w += 16;
+
+    size_t p = ks;
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      const float* restrict a4 = a[4];
+      assert(a4 != NULL);
+      if XNN_UNPREDICTABLE(a4 != zero) {
+        a4 = (const float*) ((uintptr_t) a4 + a_offset);
+      }
+      const float* restrict a5 = a[5];
+      assert(a5 != NULL);
+      if XNN_UNPREDICTABLE(a5 != zero) {
+        a5 = (const float*) ((uintptr_t) a5 + a_offset);
+      }
+      const float* restrict a6 = a[6];
+      assert(a6 != NULL);
+      if XNN_UNPREDICTABLE(a6 != zero) {
+        a6 = (const float*) ((uintptr_t) a6 + a_offset);
+      }
+      const float* restrict a7 = a[7];
+      assert(a7 != NULL);
+      if XNN_UNPREDICTABLE(a7 != zero) {
+        a7 = (const float*) ((uintptr_t) a7 + a_offset);
+      }
+      a += 8;
+
+      size_t k = kc;
+      do {
+        const __m512 vb0123456789ABCDEF = _mm512_load_ps(w);
+        w += 16;
+
+        vacc0x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a0), vb0123456789ABCDEF, vacc0x0123456789ABCDEF);
+        vacc1x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a1), vb0123456789ABCDEF, vacc1x0123456789ABCDEF);
+        vacc2x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a2), vb0123456789ABCDEF, vacc2x0123456789ABCDEF);
+        vacc3x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a3), vb0123456789ABCDEF, vacc3x0123456789ABCDEF);
+        vacc4x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a4), vb0123456789ABCDEF, vacc4x0123456789ABCDEF);
+        vacc5x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a5), vb0123456789ABCDEF, vacc5x0123456789ABCDEF);
+        vacc6x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a6), vb0123456789ABCDEF, vacc6x0123456789ABCDEF);
+        vacc7x0123456789ABCDEF = _mm512_fmadd_ps(_mm512_set1_ps(*a7), vb0123456789ABCDEF, vacc7x0123456789ABCDEF);
+
+        a0 += 1;
+        a1 += 1;
+        a2 += 1;
+        a3 += 1;
+        a4 += 1;
+        a5 += 1;
+        a6 += 1;
+        a7 += 1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 8 * sizeof(void*);
+    } while (p != 0);
+
+    const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+    vacc0x0123456789ABCDEF = _mm512_min_ps(vacc0x0123456789ABCDEF, vmax);
+    vacc1x0123456789ABCDEF = _mm512_min_ps(vacc1x0123456789ABCDEF, vmax);
+    vacc2x0123456789ABCDEF = _mm512_min_ps(vacc2x0123456789ABCDEF, vmax);
+    vacc3x0123456789ABCDEF = _mm512_min_ps(vacc3x0123456789ABCDEF, vmax);
+    vacc4x0123456789ABCDEF = _mm512_min_ps(vacc4x0123456789ABCDEF, vmax);
+    vacc5x0123456789ABCDEF = _mm512_min_ps(vacc5x0123456789ABCDEF, vmax);
+    vacc6x0123456789ABCDEF = _mm512_min_ps(vacc6x0123456789ABCDEF, vmax);
+    vacc7x0123456789ABCDEF = _mm512_min_ps(vacc7x0123456789ABCDEF, vmax);
+
+    const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+    vacc0x0123456789ABCDEF = _mm512_max_ps(vacc0x0123456789ABCDEF, vmin);
+    vacc1x0123456789ABCDEF = _mm512_max_ps(vacc1x0123456789ABCDEF, vmin);
+    vacc2x0123456789ABCDEF = _mm512_max_ps(vacc2x0123456789ABCDEF, vmin);
+    vacc3x0123456789ABCDEF = _mm512_max_ps(vacc3x0123456789ABCDEF, vmin);
+    vacc4x0123456789ABCDEF = _mm512_max_ps(vacc4x0123456789ABCDEF, vmin);
+    vacc5x0123456789ABCDEF = _mm512_max_ps(vacc5x0123456789ABCDEF, vmin);
+    vacc6x0123456789ABCDEF = _mm512_max_ps(vacc6x0123456789ABCDEF, vmin);
+    vacc7x0123456789ABCDEF = _mm512_max_ps(vacc7x0123456789ABCDEF, vmin);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm512_storeu_ps(c7, vacc7x0123456789ABCDEF);
+      c7 = (float*) ((uintptr_t) c7 + cn_stride);
+      _mm512_storeu_ps(c6, vacc6x0123456789ABCDEF);
+      c6 = (float*) ((uintptr_t) c6 + cn_stride);
+      _mm512_storeu_ps(c5, vacc5x0123456789ABCDEF);
+      c5 = (float*) ((uintptr_t) c5 + cn_stride);
+      _mm512_storeu_ps(c4, vacc4x0123456789ABCDEF);
+      c4 = (float*) ((uintptr_t) c4 + cn_stride);
+      _mm512_storeu_ps(c3, vacc3x0123456789ABCDEF);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      _mm512_storeu_ps(c2, vacc2x0123456789ABCDEF);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      _mm512_storeu_ps(c1, vacc1x0123456789ABCDEF);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      _mm512_storeu_ps(c0, vacc0x0123456789ABCDEF);
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      if (nc & 15) {
+        // Prepare mask for valid 32-bit elements (depends on nc).
+        const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << nc) - UINT32_C(1)));
+
+        _mm512_mask_storeu_ps(c7, vmask, vacc7x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c6, vmask, vacc6x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c5, vmask, vacc5x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c4, vmask, vacc4x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c3, vmask, vacc3x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c2, vmask, vacc2x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c1, vmask, vacc1x0123456789ABCDEF);
+        _mm512_mask_storeu_ps(c0, vmask, vacc0x0123456789ABCDEF);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/init.c b/src/init.c
index 2be2415..fd2aa12 100644
--- a/src/init.c
+++ b/src/init.c
@@ -622,7 +622,16 @@
 
   /**************************** F32 micro-kernels ****************************/
   #ifndef XNN_NO_F32_OPERATORS
-    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
+    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
+      xnn_params.f32.gemm = (struct gemm_parameters) {
+        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_7x16__avx512f_broadcast,
+        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_7x16__avx512f_broadcast,
+        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
+        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
+        .mr = 7,
+        .nr = 16,
+      };
+    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
       xnn_params.f32.gemm = (struct gemm_parameters) {
         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_7x8__fma3_broadcast,
         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_7x8__fma3_broadcast,
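
The init.c change above slots AVX-512F into XNNPACK's one-shot dispatch in priority order: a CPU with AVX-512F also reports FMA3, so the wider 7x16 kernels must be tested for first or they would never be selected. Schematically (the select() helper is illustrative, not XNNPACK API; the cpuinfo predicates are the real ones):

    // Most capable ISA first; each branch installs a matching (mr, nr) tile.
    if (cpuinfo_has_x86_avx512f()) {
      select(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast, /*mr=*/7, /*nr=*/16);
    } else if (cpuinfo_has_x86_fma3()) {
      select(xnn_f32_gemm_ukernel_7x8__fma3_broadcast, /*mr=*/7, /*nr=*/8);
    }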
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index f69cb08..ed45fb2 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -33,16 +33,17 @@
       const union xnn_f32_output_params* params);
 
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_dup)
@@ -53,6 +54,7 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__sse)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
@@ -64,14 +66,14 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_dup)
@@ -80,11 +82,13 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__neonfma)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__psimd)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__sse)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
@@ -93,10 +97,10 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128)
@@ -106,8 +110,10 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neon)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neonfma)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8__fma3_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neon)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neonfma)
@@ -127,16 +133,17 @@
       const union xnn_f32_output_params* params);
 
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x4__scalar)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_dup)
@@ -147,6 +154,7 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__sse)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_2x4__scalar)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__neon_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__scalar)
@@ -158,14 +166,14 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_dup)
@@ -174,11 +182,13 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__neonfma)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__psimd)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__sse)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
@@ -187,10 +197,10 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128)
@@ -200,8 +210,10 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neon)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neonfma)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neon)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neonfma)
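
The avx512f kernels declared above consist of AVX-512F intrinsics, so their translation units must be compiled with AVX-512F code generation enabled; with GCC or Clang that is the standard -mavx512f flag (a compiler convention, not something this patch adds). A guard makes the requirement explicit:

    // Compile the avx512f micro-kernel sources with -mavx512f.
    #if !defined(__AVX512F__)
    #error "this file must be built with AVX-512F enabled (-mavx512f)"
    #endif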
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index fb61cac..96167cd 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -35,16 +35,17 @@
       const union xnn_f32_output_params* params);
 
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_dup)
@@ -55,6 +56,7 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__sse)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
@@ -67,14 +69,14 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_dup)
@@ -83,19 +85,21 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__neonfma)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__psimd)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__fma3_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128)
@@ -105,8 +109,10 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neon)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neonfma)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8__fma3_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neon)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neonfma)