Add AVX and FMA3 microkernels for F32 GEMM/GEMMINC/IGEMM
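
This change adds single-precision broadcast-style GEMM microkernels for x86,
all generated from a common template (src/f32-gemm/avx-broadcast.c.in): 1x8,
4x8, 5x8, 6x8, and 7x8 tiles using plain AVX (_mm256_mul_ps followed by
_mm256_add_ps) and 1x8, 4x8, 5x8, 6x8, 7x8, and 8x8 tiles using FMA3
(_mm256_fmadd_ps). Each K iteration broadcasts one scalar from every row of A
and multiplies it against an 8-wide panel of packed weights that begins with
8 bias values; accumulators are clamped to [params->sse.min, params->sse.max]
before the store, and N remainders are written with 4/2/1-element tail
stores. When mr is smaller than the tile height, the pointer-aliasing
prologue (mr < 2, mr <= 2, ...) redirects the out-of-range rows at the
previous row's buffers, so they recompute the same values and the duplicate
stores, issued from the highest row down to row 0, are harmless.

As a usage illustration only (not part of this change), the sketch below
drives the new 1x8 AVX kernel on a tiny problem. The local union is a
stand-in that mirrors only the fields the kernel reads; the real
xnn_f32_output_params definition lives in XNNPACK's params header.

    #include <stdalign.h>
    #include <stddef.h>
    #include <stdio.h>

    // Stand-in for XNNPACK's xnn_f32_output_params: only the sse.min/sse.max
    // fields read by the kernel are modeled here (illustration only).
    union xnn_f32_output_params {
      struct {
        alignas(16) float max[4];
        alignas(16) float min[4];
      } sse;
    };

    void xnn_f32_gemm_ukernel_1x8__avx_broadcast(
        size_t mr, size_t nc, size_t kc, const float* a, size_t a_stride,
        const float* w, float* c, size_t cm_stride, size_t cn_stride,
        const union xnn_f32_output_params* params);

    int main(void) {
      enum { KC = 2, NR = 8 };
      const float a[KC] = { 1.0f, 2.0f };
      // Packed weights: 8 bias values, then one 8-wide panel per K element.
      alignas(32) const float w[NR + KC * NR] = {
        /* bias */ 0, 0, 0, 0, 0, 0, 0, 0,
        /* k=0 */  1, 1, 1, 1, 1, 1, 1, 1,
        /* k=1 */  2, 2, 2, 2, 2, 2, 2, 2,
      };
      float c[NR];
      const union xnn_f32_output_params params = {
        .sse = { .max = { 10, 10, 10, 10 }, .min = { 0, 0, 0, 0 } } };
      xnn_f32_gemm_ukernel_1x8__avx_broadcast(
          /*mr=*/1, /*nc=*/NR, /*kc=*/KC * sizeof(float), a, /*a_stride=*/0,
          w, c, /*cm_stride=*/0, /*cn_stride=*/NR * sizeof(float), &params);
      for (int i = 0; i < NR; i++) {
        printf("%.1f ", c[i]);  // each output is 0 + 1*1 + 2*2 = 5.0
      }
      printf("\n");
      return 0;
    }

Expected output: eight 5.0 values (bias 0 plus 1*1 + 2*2, clamped to
[0, 10]). Note that kc is passed in bytes, and that w carries 32-byte
alignment: the kernels load each weight panel with _mm256_load_ps, so
packed weights must stay 32-byte aligned from panel to panel.
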
PiperOrigin-RevId: 281807374
diff --git a/src/f32-gemm/1x8-avx-broadcast.c b/src/f32-gemm/1x8-avx-broadcast.c
new file mode 100644
index 0000000..79265ea
--- /dev/null
+++ b/src/f32-gemm/1x8-avx-broadcast.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_1x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/1x8-fma3-broadcast.c b/src/f32-gemm/1x8-fma3-broadcast.c
new file mode 100644
index 0000000..ae51c41
--- /dev/null
+++ b/src/f32-gemm/1x8-fma3-broadcast.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_1x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/4x8-avx-broadcast.c b/src/f32-gemm/4x8-avx-broadcast.c
new file mode 100644
index 0000000..ecd3a98
--- /dev/null
+++ b/src/f32-gemm/4x8-avx-broadcast.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_4x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/4x8-fma3-broadcast.c b/src/f32-gemm/4x8-fma3-broadcast.c
new file mode 100644
index 0000000..8523d2b
--- /dev/null
+++ b/src/f32-gemm/4x8-fma3-broadcast.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_4x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/5x8-avx-broadcast.c b/src/f32-gemm/5x8-avx-broadcast.c
new file mode 100644
index 0000000..62c2a36
--- /dev/null
+++ b/src/f32-gemm/5x8-avx-broadcast.c
@@ -0,0 +1,186 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_5x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 5);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/5x8-fma3-broadcast.c b/src/f32-gemm/5x8-fma3-broadcast.c
new file mode 100644
index 0000000..e5ad31f
--- /dev/null
+++ b/src/f32-gemm/5x8-fma3-broadcast.c
@@ -0,0 +1,186 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_5x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 5);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/6x8-avx-broadcast.c b/src/f32-gemm/6x8-avx-broadcast.c
new file mode 100644
index 0000000..4a718e8
--- /dev/null
+++ b/src/f32-gemm/6x8-avx-broadcast.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_6x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 6);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/6x8-fma3-broadcast.c b/src/f32-gemm/6x8-fma3-broadcast.c
new file mode 100644
index 0000000..05c8e0d
--- /dev/null
+++ b/src/f32-gemm/6x8-fma3-broadcast.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_6x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 6);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/7x8-avx-broadcast.c b/src/f32-gemm/7x8-avx-broadcast.c
new file mode 100644
index 0000000..ed6745d
--- /dev/null
+++ b/src/f32-gemm/7x8-avx-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_7x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 7);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+ const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ a6 = a5;
+ c6 = c5;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ __m256 vacc6x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+ vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a6 = (const float*) ((uintptr_t) a6 - kc);
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/7x8-fma3-broadcast.c b/src/f32-gemm/7x8-fma3-broadcast.c
new file mode 100644
index 0000000..4f0d223
--- /dev/null
+++ b/src/f32-gemm/7x8-fma3-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_7x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 7);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+ const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ a6 = a5;
+ c6 = c5;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ __m256 vacc6x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a6 = (const float*) ((uintptr_t) a6 - kc);
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/8x8-fma3-broadcast.c b/src/f32-gemm/8x8-fma3-broadcast.c
new file mode 100644
index 0000000..c4ef3d5
--- /dev/null
+++ b/src/f32-gemm/8x8-fma3-broadcast.c
@@ -0,0 +1,255 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_8x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 8);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+ const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ a6 = a5;
+ c6 = c5;
+ }
+ const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
+ float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 8) {
+ a7 = a6;
+ c7 = c6;
+ }
+
+ do {
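+ // The packed weights w begin with an 8-wide row of biases, which seeds the
+ // accumulators of all 8 rows of the tile.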
+ __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ __m256 vacc6x01234567 = vacc0x01234567;
+ __m256 vacc7x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t k = kc;
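+ // Each iteration broadcasts one element from every row of A and multiplies
+ // it against an 8-wide slice of B. kc is expressed in bytes, so the counter
+ // steps by sizeof(float) per column of A.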
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+ const __m256 va7 = _mm256_broadcast_ss(a7);
+ a7 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+ vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
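+ // Clamp every accumulator to the [min, max] output range carried in params.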
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+ vacc7x01234567 = _mm256_min_ps(vacc7x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+ vacc7x01234567 = _mm256_max_ps(vacc7x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
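+ // Full 8-column tile: store rows from the highest index down, then rewind
+ // the A pointers by kc so the same rows feed the next column block.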
+ _mm256_storeu_ps(c7, vacc7x01234567);
+ c7 = (float*) ((uintptr_t) c7 + cn_stride);
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a7 = (const float*) ((uintptr_t) a7 - kc);
+ a6 = (const float*) ((uintptr_t) a6 - kc);
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
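+ // Partial tile (nc < 8): decompose nc into 4-, 2-, and 1-wide stores,
+ // working on the 128-bit halves of each accumulator.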
+ __m128 vacc7x0123 = _mm256_castps256_ps128(vacc7x01234567);
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c7, vacc7x0123);
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc7x0123 = _mm256_extractf128_ps(vacc7x01234567, 1);
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c7 += 4;
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c7, vacc7x0123);
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc7x0123 = _mm_movehl_ps(vacc7x0123, vacc7x0123);
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c7 += 2;
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c7, vacc7x0123);
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/avx-broadcast.c.in b/src/f32-gemm/avx-broadcast.c.in
new file mode 100644
index 0000000..90d7e08
--- /dev/null
+++ b/src/f32-gemm/avx-broadcast.c.in
@@ -0,0 +1,163 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMN"
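+// Template parameters: MR is the number of rows, NR the number of columns
+// (a multiple of 4), FMA selects the ISA (0: AVX multiply-then-add,
+// 3: FMA3 fused multiply-add), and INC selects the GEMMINC variant that
+// resumes from an external accumulator buffer instead of the packed biases.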
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+$ISA = {0: "avx", 3: "fma3"}[FMA]
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${ISA}_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ $if INC:
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= ${MR});
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ $if INC:
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
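+ // Clamp row pointers for rows past mr onto the previous row; each generated
+ // predicate is logically (mr <= M), only its spelling varies with M.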
+ $for M in range(1, MR):
+ const float* a${M} = (const float*) ((uintptr_t) a${M-1} + a_stride);
+ float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+ $if M % 2 == 0:
+ if XNN_UNPREDICTABLE(mr <= ${M}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+ $elif M + 1 == MR:
+ if XNN_UNPREDICTABLE(mr != ${M+1}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+ $else:
+ if XNN_UNPREDICTABLE(mr < ${M+1}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+
+ do {
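+ // GEMMINC reloads MR*NR partial sums produced by an earlier pass over a
+ // slice of K; plain GEMM seeds every row with the bias row packed at the
+ // start of w.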
+ $if INC:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ __m256 vacc${M}x${ABC[N:N+8]} = _mm256_load_ps(acc + ${M*NR+N});
+ acc += ${MR*NR};
+ $else:
+ $for N in range(0, NR, 8):
+ __m256 vacc0x${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+ $for M in range(1, MR):
+ $for N in range(0, NR, 8):
+ __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
+ w += ${NR};
+
+ size_t k = kc;
+ do {
+ $for M in range(MR):
+ const __m256 va${M} = _mm256_broadcast_ss(a${M});
+ a${M} += 1;
+
+ const __m256 vb${ABC[0:8]} = _mm256_load_ps(w);
+ $for N in range(8, NR, 8):
+ const __m256 vb${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+ w += ${NR};
+
+ $for N in range(0, NR, 8):
+ $for M in range(MR):
+ $if FMA == 3:
+ vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
+ $else:
+ vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8]}));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ $for N in range(0, NR, 8):
+ $for M in range(MR):
+ vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ $for N in range(0, NR, 8):
+ $for M in range(MR):
+ vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
+
+ if XNN_LIKELY(nc >= ${NR}) {
+ $for M in reversed(range(MR)):
+ _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+ $for N in range(8, NR, 8):
+ _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+ c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+ $for M in reversed(range(MR)):
+ a${M} = (const float*) ((uintptr_t) a${M} - kc);
+
+ nc -= ${NR};
+ } else {
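+ // Remainder columns: peel power-of-two chunks of nc from largest to
+ // smallest. The chunk equal to NR itself cannot occur here (nc < NR), and
+ // after the 8-column chunk the accumulators are narrowed to their 128-bit
+ // halves for the 4/2/1-column stores.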
+ $for LOG2N in reversed(range(NR.bit_length())):
+ $if NR != 1 << LOG2N:
+ if (nc & ${1 << LOG2N}) {
+ $if LOG2N >= 3:
+ $for M in reversed(range(MR)):
+ _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+ $for N in range(8, 1 << LOG2N, 8):
+ _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+
+ $for M in reversed(range(MR)):
+ $for N in range(0, 1 << (LOG2N - 1), 8):
+ vacc${M}x${ABC[N:N+8]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+8]};
+
+ $for M in reversed(range(MR)):
+ c${M} += ${1 << LOG2N};
+ $elif LOG2N == 2:
+ $for M in reversed(range(MR)):
+ _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
+
+ $for M in reversed(range(MR)):
+ vacc${M}x${ABC[0:4]} = _mm256_extractf128_ps(vacc${M}x${ABC[0:8]}, 1);
+
+ $for M in reversed(range(MR)):
+ c${M} += 4;
+ $elif LOG2N == 1:
+ $for M in reversed(range(MR)):
+ _mm_storel_pi((__m64*) c${M}, vacc${M}x${ABC[0:4]});
+
+ $for M in reversed(range(MR)):
+ vacc${M}x${ABC[0:4]} = _mm_movehl_ps(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]});
+
+ $for M in reversed(range(MR)):
+ c${M} += 2;
+ $elif LOG2N == 0:
+ $for M in reversed(range(MR)):
+ _mm_store_ss(c${M}, vacc${M}x${ABC[0:4]});
+ }
+ $if LOG2N == 3:
+ $for M in reversed(range(MR)):
+ __m128 vacc${M}x${ABC[0:4]} = _mm256_castps256_ps128(vacc${M}x${ABC[0:8]});
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
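+
+// A minimal usage sketch (an illustration assumed by this note, not code from
+// this commit): a caller would tile the output into blocks of up to MR rows
+// and let the microkernel walk all N columns itself, with weights packed per
+// NR-column block as [NR biases | K x NR weights]. The names M, N, K, a, c,
+// packed_w, and output_params are hypothetical.
+//
+//   for (size_t m = 0; m < M; m += ${MR}) {
+//     const size_t mb = M - m < ${MR} ? M - m : ${MR};
+//     xnn_f32_gemm_ukernel_${MR}x${NR}__fma3_broadcast(
+//         mb, N, K * sizeof(float),
+//         a + m * K, K * sizeof(float) /* a_stride */,
+//         packed_w,
+//         c + m * N, N * sizeof(float) /* cm_stride */,
+//         ${NR} * sizeof(float) /* cn_stride */,
+//         &output_params);
+//   }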
diff --git a/src/f32-gemminc/1x8-avx-broadcast.c b/src/f32-gemminc/1x8-avx-broadcast.c
new file mode 100644
index 0000000..f1e4490
--- /dev/null
+++ b/src/f32-gemminc/1x8-avx-broadcast.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_1x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ acc += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/1x8-fma3-broadcast.c b/src/f32-gemminc/1x8-fma3-broadcast.c
new file mode 100644
index 0000000..204cfca
--- /dev/null
+++ b/src/f32-gemminc/1x8-fma3-broadcast.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_1x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ acc += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/4x8-avx-broadcast.c b/src/f32-gemminc/4x8-avx-broadcast.c
new file mode 100644
index 0000000..79ece60
--- /dev/null
+++ b/src/f32-gemminc/4x8-avx-broadcast.c
@@ -0,0 +1,165 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_4x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ acc += 32;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
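+ // AVX has no fused multiply-add, so each update is a separate multiply and
+ // add (two roundings); the FMA3 variant fuses them with a single rounding.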
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/4x8-fma3-broadcast.c b/src/f32-gemminc/4x8-fma3-broadcast.c
new file mode 100644
index 0000000..07e0c56
--- /dev/null
+++ b/src/f32-gemminc/4x8-fma3-broadcast.c
@@ -0,0 +1,165 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_4x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ acc += 32;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/5x8-avx-broadcast.c b/src/f32-gemminc/5x8-avx-broadcast.c
new file mode 100644
index 0000000..53ab1a8
--- /dev/null
+++ b/src/f32-gemminc/5x8-avx-broadcast.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_5x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 5);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ acc += 40;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/5x8-fma3-broadcast.c b/src/f32-gemminc/5x8-fma3-broadcast.c
new file mode 100644
index 0000000..5589ab1
--- /dev/null
+++ b/src/f32-gemminc/5x8-fma3-broadcast.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_5x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 5);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ acc += 40;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/6x8-avx-broadcast.c b/src/f32-gemminc/6x8-avx-broadcast.c
new file mode 100644
index 0000000..8bf0197
--- /dev/null
+++ b/src/f32-gemminc/6x8-avx-broadcast.c
@@ -0,0 +1,211 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_6x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 6);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+ acc += 48;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/6x8-fma3-broadcast.c b/src/f32-gemminc/6x8-fma3-broadcast.c
new file mode 100644
index 0000000..08d2d7a
--- /dev/null
+++ b/src/f32-gemminc/6x8-fma3-broadcast.c
@@ -0,0 +1,211 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_6x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 6);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+ acc += 48;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/7x8-avx-broadcast.c b/src/f32-gemminc/7x8-avx-broadcast.c
new file mode 100644
index 0000000..a991ae9
--- /dev/null
+++ b/src/f32-gemminc/7x8-avx-broadcast.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_7x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 7);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+ const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ a6 = a5;
+ c6 = c5;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+ __m256 vacc6x01234567 = _mm256_load_ps(acc + 48);
+ acc += 56;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+ vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a6 = (const float*) ((uintptr_t) a6 - kc);
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
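+ // Fewer than 8 columns remain: store the tail in 4/2/1-element steps from
+ // the low 128-bit half, pulling in the high half once four lanes are done.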
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/7x8-fma3-broadcast.c b/src/f32-gemminc/7x8-fma3-broadcast.c
new file mode 100644
index 0000000..5ef6236
--- /dev/null
+++ b/src/f32-gemminc/7x8-fma3-broadcast.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_7x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 7);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+ const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ a6 = a5;
+ c6 = c5;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+ __m256 vacc6x01234567 = _mm256_load_ps(acc + 48);
+ acc += 56;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
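+ // FMA3 variant: multiply and accumulate fuse into a single _mm256_fmadd_ps
+ // with one rounding step per update.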
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a6 = (const float*) ((uintptr_t) a6 - kc);
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemminc/8x8-fma3-broadcast.c b/src/f32-gemminc/8x8-fma3-broadcast.c
new file mode 100644
index 0000000..bbdb925
--- /dev/null
+++ b/src/f32-gemminc/8x8-fma3-broadcast.c
@@ -0,0 +1,257 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemminc_ukernel_8x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float*restrict a,
+ size_t a_stride,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float*restrict acc,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 8);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+ assert(acc != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+ const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ a4 = a3;
+ c4 = c3;
+ }
+ const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ a5 = a4;
+ c5 = c4;
+ }
+ const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ a6 = a5;
+ c6 = c5;
+ }
+ const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
+ float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 8) {
+ a7 = a6;
+ c7 = c6;
+ }
+
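+ // 8-row tile: acc advances by 64 floats (8 rows x 8 columns) per block of
+ // output columns. Row 7 above is clamped with mr != 8, which is equivalent
+ // to mr < 8 given the assert(mr <= 8).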
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(acc + 0);
+ __m256 vacc1x01234567 = _mm256_load_ps(acc + 8);
+ __m256 vacc2x01234567 = _mm256_load_ps(acc + 16);
+ __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);
+ __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);
+ __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);
+ __m256 vacc6x01234567 = _mm256_load_ps(acc + 48);
+ __m256 vacc7x01234567 = _mm256_load_ps(acc + 56);
+ acc += 64;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+ const __m256 va7 = _mm256_broadcast_ss(a7);
+ a7 += 1;
+
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+ vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
+
+ k -= sizeof(float);
+ } while (k != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+ vacc7x01234567 = _mm256_min_ps(vacc7x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+ vacc7x01234567 = _mm256_max_ps(vacc7x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c7, vacc7x01234567);
+ c7 = (float*) ((uintptr_t) c7 + cn_stride);
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a7 = (const float*) ((uintptr_t) a7 - kc);
+ a6 = (const float*) ((uintptr_t) a6 - kc);
+ a5 = (const float*) ((uintptr_t) a5 - kc);
+ a4 = (const float*) ((uintptr_t) a4 - kc);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ __m128 vacc7x0123 = _mm256_castps256_ps128(vacc7x01234567);
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c7, vacc7x0123);
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc7x0123 = _mm256_extractf128_ps(vacc7x01234567, 1);
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c7 += 4;
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c7, vacc7x0123);
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc7x0123 = _mm_movehl_ps(vacc7x0123, vacc7x0123);
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c7 += 2;
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c7, vacc7x0123);
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/1x8-avx-broadcast.c b/src/f32-igemm/1x8-avx-broadcast.c
new file mode 100644
index 0000000..4b760bc
--- /dev/null
+++ b/src/f32-igemm/1x8-avx-broadcast.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_1x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ w += 8;
+
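+ // IGEMM: a is an indirection buffer of row pointers, one group of mr
+ // entries per step; p counts the remaining pointer bytes. Entries equal to
+ // the shared zero row denote padding and must not receive a_offset.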
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
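+ // Rewind the indirection buffer so the same row pointers are replayed for
+ // the next group of 8 output columns.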
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/1x8-fma3-broadcast.c b/src/f32-igemm/1x8-fma3-broadcast.c
new file mode 100644
index 0000000..113bbba
--- /dev/null
+++ b/src/f32-igemm/1x8-fma3-broadcast.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_1x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+
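+ // The only difference from the AVX variant: a fused multiply-add.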
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/4x8-avx-broadcast.c b/src/f32-igemm/4x8-avx-broadcast.c
new file mode 100644
index 0000000..259f4d6
--- /dev/null
+++ b/src/f32-igemm/4x8-avx-broadcast.c
@@ -0,0 +1,182 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_4x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
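+ // Only the C row pointers need mr clamping here; the A row pointers are
+ // re-read from the indirection buffer on every step of the ks loop.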
+
+ do {
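+ // All rows start from the same bias row: the first 8 floats of the packed
+ // weights w hold the bias for these 8 output columns.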
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/4x8-fma3-broadcast.c b/src/f32-igemm/4x8-fma3-broadcast.c
new file mode 100644
index 0000000..537bcdf
--- /dev/null
+++ b/src/f32-igemm/4x8-fma3-broadcast.c
@@ -0,0 +1,182 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_4x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
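+ // Structurally identical to the 4x8 AVX kernel above, with each
+ // multiply/add pair replaced by a single _mm256_fmadd_ps.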
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/5x8-avx-broadcast.c b/src/f32-igemm/5x8-avx-broadcast.c
new file mode 100644
index 0000000..5290cac
--- /dev/null
+++ b/src/f32-igemm/5x8-avx-broadcast.c
@@ -0,0 +1,207 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_5x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 5);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (5 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
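+ // Advance past this group of 5 row pointers in the indirection buffer.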
+ a += 5;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 5 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/5x8-fma3-broadcast.c b/src/f32-igemm/5x8-fma3-broadcast.c
new file mode 100644
index 0000000..298ed86
--- /dev/null
+++ b/src/f32-igemm/5x8-fma3-broadcast.c
@@ -0,0 +1,207 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_5x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 5);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (5 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+
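+ // FMA3 counterpart of the 5x8 AVX kernel above.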
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
+ a += 5;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 5 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/6x8-avx-broadcast.c b/src/f32-igemm/6x8-avx-broadcast.c
new file mode 100644
index 0000000..6f1f6dc
--- /dev/null
+++ b/src/f32-igemm/6x8-avx-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_6x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 6);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (6 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 6) {
+ c5 = c4;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
+ const float* restrict a5 = a[5];
+ assert(a5 != NULL);
+ if XNN_UNPREDICTABLE(a5 != zero) {
+ a5 = (const float*) ((uintptr_t) a5 + a_offset);
+ }
+ a += 6;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+ k -= sizeof(float);
+ } while (k != 0);
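+ // One group of 6 row pointers consumed; ks counts these groups in bytes.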
+ p -= 6 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/6x8-fma3-broadcast.c b/src/f32-igemm/6x8-fma3-broadcast.c
new file mode 100644
index 0000000..9d16854
--- /dev/null
+++ b/src/f32-igemm/6x8-fma3-broadcast.c
@@ -0,0 +1,232 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_6x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 6);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (6 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 6) {
+ c5 = c4;
+ }
+
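+ // FMA3 counterpart of the 6x8 AVX kernel above.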
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
+ const float* restrict a5 = a[5];
+ assert(a5 != NULL);
+ if XNN_UNPREDICTABLE(a5 != zero) {
+ a5 = (const float*) ((uintptr_t) a5 + a_offset);
+ }
+ a += 6;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 6 * sizeof(void*);
+ } while (p != 0);
+
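+ // Clamp the accumulators into the output range from params: min against the
+ // upper bound, then max against the lower bound.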
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
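+ // Partial tile: peel the remaining columns with 4-, 2-, and 1-wide stores
+ // selected by the bits of nc.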
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/7x8-avx-broadcast.c b/src/f32-igemm/7x8-avx-broadcast.c
new file mode 100644
index 0000000..8a56961
--- /dev/null
+++ b/src/f32-igemm/7x8-avx-broadcast.c
@@ -0,0 +1,261 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_7x8__avx_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 7);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (7 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ c5 = c4;
+ }
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ c6 = c5;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ __m256 vacc6x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
+ const float* restrict a5 = a[5];
+ assert(a5 != NULL);
+ if XNN_UNPREDICTABLE(a5 != zero) {
+ a5 = (const float*) ((uintptr_t) a5 + a_offset);
+ }
+ const float* restrict a6 = a[6];
+ assert(a6 != NULL);
+ if XNN_UNPREDICTABLE(a6 != zero) {
+ a6 = (const float*) ((uintptr_t) a6 + a_offset);
+ }
+ a += 7;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+
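+ // AVX lacks a fused multiply-add instruction, so each update is a separate
+ // multiply followed by an add; the fma3 variant of this kernel fuses them.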
+ vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
+ vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
+ vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
+ vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
+ vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
+ vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
+ vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 7 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
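+ // Full 8-column tile: store rows from the highest down, then advance each
+ // row pointer by cn_stride to the next tile of C.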
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/7x8-fma3-broadcast.c b/src/f32-igemm/7x8-fma3-broadcast.c
new file mode 100644
index 0000000..89b17f3
--- /dev/null
+++ b/src/f32-igemm/7x8-fma3-broadcast.c
@@ -0,0 +1,259 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_7x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 7);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (7 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ c5 = c4;
+ }
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ c6 = c5;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ __m256 vacc6x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
+ const float* restrict a5 = a[5];
+ assert(a5 != NULL);
+ if XNN_UNPREDICTABLE(a5 != zero) {
+ a5 = (const float*) ((uintptr_t) a5 + a_offset);
+ }
+ const float* restrict a6 = a[6];
+ assert(a6 != NULL);
+ if XNN_UNPREDICTABLE(a6 != zero) {
+ a6 = (const float*) ((uintptr_t) a6 + a_offset);
+ }
+ a += 7;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 7 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
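+ // Rewind the indirection buffer so the same A pointers are replayed for the
+ // next 8-column tile of C.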
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/8x8-fma3-broadcast.c b/src/f32-igemm/8x8-fma3-broadcast.c
new file mode 100644
index 0000000..7cb4625
--- /dev/null
+++ b/src/f32-igemm/8x8-fma3-broadcast.c
@@ -0,0 +1,284 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/avx-broadcast.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_8x8__fma3_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= 8);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (8 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 4) {
+ c3 = c2;
+ }
+ float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 4) {
+ c4 = c3;
+ }
+ float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 6) {
+ c5 = c4;
+ }
+ float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 6) {
+ c6 = c5;
+ }
+ float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 8) {
+ c7 = c6;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_load_ps(w);
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc4x01234567 = vacc0x01234567;
+ __m256 vacc5x01234567 = vacc0x01234567;
+ __m256 vacc6x01234567 = vacc0x01234567;
+ __m256 vacc7x01234567 = vacc0x01234567;
+ w += 8;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ const float* restrict a4 = a[4];
+ assert(a4 != NULL);
+ if XNN_UNPREDICTABLE(a4 != zero) {
+ a4 = (const float*) ((uintptr_t) a4 + a_offset);
+ }
+ const float* restrict a5 = a[5];
+ assert(a5 != NULL);
+ if XNN_UNPREDICTABLE(a5 != zero) {
+ a5 = (const float*) ((uintptr_t) a5 + a_offset);
+ }
+ const float* restrict a6 = a[6];
+ assert(a6 != NULL);
+ if XNN_UNPREDICTABLE(a6 != zero) {
+ a6 = (const float*) ((uintptr_t) a6 + a_offset);
+ }
+ const float* restrict a7 = a[7];
+ assert(a7 != NULL);
+ if XNN_UNPREDICTABLE(a7 != zero) {
+ a7 = (const float*) ((uintptr_t) a7 + a_offset);
+ }
+ a += 8;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_load_ps(w);
+ w += 8;
+
+ const __m256 va0 = _mm256_broadcast_ss(a0);
+ a0 += 1;
+ const __m256 va1 = _mm256_broadcast_ss(a1);
+ a1 += 1;
+ const __m256 va2 = _mm256_broadcast_ss(a2);
+ a2 += 1;
+ const __m256 va3 = _mm256_broadcast_ss(a3);
+ a3 += 1;
+ const __m256 va4 = _mm256_broadcast_ss(a4);
+ a4 += 1;
+ const __m256 va5 = _mm256_broadcast_ss(a5);
+ a5 += 1;
+ const __m256 va6 = _mm256_broadcast_ss(a6);
+ a6 += 1;
+ const __m256 va7 = _mm256_broadcast_ss(a7);
+ a7 += 1;
+
+ vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
+ vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
+ vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
+ vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
+ vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
+ vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
+ vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
+ vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= 8 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
+ vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
+ vacc6x01234567 = _mm256_min_ps(vacc6x01234567, vmax);
+ vacc7x01234567 = _mm256_min_ps(vacc7x01234567, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
+ vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
+ vacc6x01234567 = _mm256_max_ps(vacc6x01234567, vmin);
+ vacc7x01234567 = _mm256_max_ps(vacc7x01234567, vmin);
+
+ if XNN_LIKELY(nc >= 8) {
+ _mm256_storeu_ps(c7, vacc7x01234567);
+ c7 = (float*) ((uintptr_t) c7 + cn_stride);
+ _mm256_storeu_ps(c6, vacc6x01234567);
+ c6 = (float*) ((uintptr_t) c6 + cn_stride);
+ _mm256_storeu_ps(c5, vacc5x01234567);
+ c5 = (float*) ((uintptr_t) c5 + cn_stride);
+ _mm256_storeu_ps(c4, vacc4x01234567);
+ c4 = (float*) ((uintptr_t) c4 + cn_stride);
+ _mm256_storeu_ps(c3, vacc3x01234567);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ _mm256_storeu_ps(c2, vacc2x01234567);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ _mm256_storeu_ps(c1, vacc1x01234567);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ _mm256_storeu_ps(c0, vacc0x01234567);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 8;
+ } else {
+ __m128 vacc7x0123 = _mm256_castps256_ps128(vacc7x01234567);
+ __m128 vacc6x0123 = _mm256_castps256_ps128(vacc6x01234567);
+ __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
+ __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
+ __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
+ __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
+ __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
+ __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
+ if (nc & 4) {
+ _mm_storeu_ps(c7, vacc7x0123);
+ _mm_storeu_ps(c6, vacc6x0123);
+ _mm_storeu_ps(c5, vacc5x0123);
+ _mm_storeu_ps(c4, vacc4x0123);
+ _mm_storeu_ps(c3, vacc3x0123);
+ _mm_storeu_ps(c2, vacc2x0123);
+ _mm_storeu_ps(c1, vacc1x0123);
+ _mm_storeu_ps(c0, vacc0x0123);
+
+ vacc7x0123 = _mm256_extractf128_ps(vacc7x01234567, 1);
+ vacc6x0123 = _mm256_extractf128_ps(vacc6x01234567, 1);
+ vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);
+ vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
+ vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
+ vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
+ vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
+ vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
+
+ c7 += 4;
+ c6 += 4;
+ c5 += 4;
+ c4 += 4;
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storel_pi((__m64*) c7, vacc7x0123);
+ _mm_storel_pi((__m64*) c6, vacc6x0123);
+ _mm_storel_pi((__m64*) c5, vacc5x0123);
+ _mm_storel_pi((__m64*) c4, vacc4x0123);
+ _mm_storel_pi((__m64*) c3, vacc3x0123);
+ _mm_storel_pi((__m64*) c2, vacc2x0123);
+ _mm_storel_pi((__m64*) c1, vacc1x0123);
+ _mm_storel_pi((__m64*) c0, vacc0x0123);
+
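+ // Shift the two high lanes down so a trailing (nc & 1) store writes the
+ // value of column 2.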
+ vacc7x0123 = _mm_movehl_ps(vacc7x0123, vacc7x0123);
+ vacc6x0123 = _mm_movehl_ps(vacc6x0123, vacc6x0123);
+ vacc5x0123 = _mm_movehl_ps(vacc5x0123, vacc5x0123);
+ vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
+ vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
+ vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
+ vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
+ vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
+
+ c7 += 2;
+ c6 += 2;
+ c5 += 2;
+ c4 += 2;
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ _mm_store_ss(c7, vacc7x0123);
+ _mm_store_ss(c6, vacc6x0123);
+ _mm_store_ss(c5, vacc5x0123);
+ _mm_store_ss(c4, vacc4x0123);
+ _mm_store_ss(c3, vacc3x0123);
+ _mm_store_ss(c2, vacc2x0123);
+ _mm_store_ss(c1, vacc1x0123);
+ _mm_store_ss(c0, vacc0x0123);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/avx-broadcast.c.in b/src/f32-igemm/avx-broadcast.c.in
new file mode 100644
index 0000000..29d4206
--- /dev/null
+++ b/src/f32-igemm/avx-broadcast.c.in
@@ -0,0 +1,166 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR % 8 == 0
+$ABC = "0123456789ABCDEFGHIJKLMN"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+$ISA = {0: "avx", 3: "fma3"}[FMA]
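+// The template is instantiated per (MR, NR, FMA): MR rows of C per call, NR
+// columns, and FMA == 3 selects fused multiply-add in place of mul plus add.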
+void xnn_f32_igemm_ukernel_${MR}x${NR}__${ISA}_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(mr != 0);
+ assert(mr <= ${MR});
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (${MR} * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ $for M in range(1, MR):
+ float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+ $if M % 2 == 0:
+ if XNN_UNPREDICTABLE(mr <= ${M}) {
+ c${M} = c${M-1};
+ }
+ $elif M + 1 == MR:
+ if XNN_UNPREDICTABLE(mr != ${M+1}) {
+ c${M} = c${M-1};
+ }
+ $else:
+ if XNN_UNPREDICTABLE(mr < ${M+1}) {
+ c${M} = c${M-1};
+ }
+
+ do {
+ __m256 vacc0x${ABC[0:8]} = _mm256_load_ps(w);
+ $for N in range(8, NR, 8):
+ __m256 vacc0x${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+ $for M in range(1, MR):
+ $for N in range(0, NR, 8):
+ __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
+ w += ${NR};
+
+ size_t p = ks;
+ do {
+ $for M in range(MR):
+ const float* restrict a${M} = a[${M}];
+ assert(a${M} != NULL);
+ if XNN_UNPREDICTABLE(a${M} != zero) {
+ a${M} = (const float*) ((uintptr_t) a${M} + a_offset);
+ }
+ a += ${MR};
+
+ size_t k = kc;
+ do {
+ const __m256 vb${ABC[0:8]} = _mm256_load_ps(w);
+ $for N in range(8, NR, 8):
+ const __m256 vb${ABC[N:N+8]} = _mm256_load_ps(w + ${N});
+ w += ${NR};
+
+ $for M in range(MR):
+ const __m256 va${M} = _mm256_broadcast_ss(a${M});
+ a${M} += 1;
+
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ $if FMA == 3:
+ vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
+ $else:
+ vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8]}));
+ k -= sizeof(float);
+ } while (k != 0);
+ p -= ${MR} * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ $for N in range(0, NR, 8):
+ $for M in range(MR):
+ vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
+
+ const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ $for N in range(0, NR, 8):
+ $for M in range(MR):
+ vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
+
+ if XNN_LIKELY(nc >= ${NR}) {
+ $for M in reversed(range(MR)):
+ _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+ $for N in range(8, NR, 8):
+ _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+ c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= ${NR};
+ } else {
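+ // Peel the remaining columns a power of two at a time, as NR allows.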
+ $for LOG2N in reversed(range(NR.bit_length())):
+ $if NR != 1 << LOG2N:
+ if (nc & ${1 << LOG2N}) {
+ $if LOG2N >= 3:
+ $for M in reversed(range(MR)):
+ _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
+ $for N in range(8, 1 << LOG2N, 8):
+ _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
+
+ $for M in reversed(range(MR)):
+ $for N in range(0, 1 << (LOG2N - 1), 8):
+ vacc${M}x${ABC[N:N+8]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+8]};
+
+ $for M in reversed(range(MR)):
+ c${M} += ${1 << LOG2N};
+ $elif LOG2N == 2:
+ $for M in reversed(range(MR)):
+ _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
+
+ $for M in reversed(range(MR)):
+ vacc${M}x${ABC[0:4]} = _mm256_extractf128_ps(vacc${M}x${ABC[0:8]}, 1);
+
+ $for M in reversed(range(MR)):
+ c${M} += 4;
+ $elif LOG2N == 1:
+ $for M in reversed(range(MR)):
+ _mm_storel_pi((__m64*) c${M}, vacc${M}x${ABC[0:4]});
+
+ $for M in reversed(range(MR)):
+ vacc${M}x${ABC[0:4]} = _mm_movehl_ps(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]});
+
+ $for M in reversed(range(MR)):
+ c${M} += 2;
+ $elif LOG2N == 0:
+ $for M in reversed(range(MR)):
+ _mm_store_ss(c${M}, vacc${M}x${ABC[0:4]});
+ }
+ $if LOG2N == 3:
+ $for M in reversed(range(MR)):
+ __m128 vacc${M}x${ABC[0:4]} = _mm256_castps256_ps128(vacc${M}x${ABC[0:8]});
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 0df00f5..4f16309 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -37,6 +37,8 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__fma3_broadcast)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
@@ -58,6 +60,8 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__fma3_broadcast)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld128)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld128)
@@ -71,6 +75,8 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__psimd)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__sse)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__fma3_broadcast)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
@@ -79,6 +85,8 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__fma3_broadcast)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat)
@@ -86,6 +94,9 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neon)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neonfma)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x8__avx_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8__fma3_broadcast)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neon)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neonfma)
@@ -109,6 +120,8 @@
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
@@ -127,6 +140,8 @@
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld128)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128)
@@ -140,14 +155,18 @@
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__psimd)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__sse)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat)
@@ -155,6 +174,9 @@
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neon)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neonfma)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x8__avx_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neon)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neonfma)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 398545b..5faf766 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -39,6 +39,8 @@
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__fma3_broadcast)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_ld64)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_ld64)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
@@ -61,6 +63,8 @@
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__fma3_broadcast)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld128)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld64)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld128)
@@ -74,10 +78,14 @@
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__psimd)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__sse)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__fma3_broadcast)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__fma3_broadcast)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_ld64)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_ld64)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat)
@@ -85,6 +93,9 @@
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neon)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neonfma)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x8__avx_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_7x8__fma3_broadcast)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8__fma3_broadcast)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neon)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neonfma)
diff --git a/src/xnnpack/isa-checks.h b/src/xnnpack/isa-checks.h
index 226640c..e5d1fc6 100644
--- a/src/xnnpack/isa-checks.h
+++ b/src/xnnpack/isa-checks.h
@@ -52,6 +52,13 @@
} \
} while (0)
+#define TEST_REQUIRES_X86_FMA3 \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
#define TEST_REQUIRES_X86_AVX2 \
do { \
if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) { \