SpMM: move the input prefetch so it fetches the next input, instead of prefetching past the current input.
Add prefetch for weights.
Apply prefetch in all Neon microkernels; previously it was applied only to the 16xN variants.

PiperOrigin-RevId: 341556223
diff --git a/src/f32-spmm/gen/12x1-minmax-neonfma.c b/src/f32-spmm/gen/12x1-minmax-neonfma.c
index be70129..bffe0c0 100644
--- a/src/f32-spmm/gen/12x1-minmax-neonfma.c
+++ b/src/f32-spmm/gen/12x1-minmax-neonfma.c
@@ -46,7 +46,9 @@
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/12x2-minmax-neonfma.c b/src/f32-spmm/gen/12x2-minmax-neonfma.c
index a63e606..fea43a1 100644
--- a/src/f32-spmm/gen/12x2-minmax-neonfma.c
+++ b/src/f32-spmm/gen/12x2-minmax-neonfma.c
@@ -49,8 +49,9 @@
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x2_t vw = vld1_f32(w); w += 2;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_lane_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_lane_f32(vacc4567c0, vi4567, vw, 0);
           vacc89ABc0 = vfmaq_lane_f32(vacc89ABc0, vi89AB, vw, 0);
@@ -97,7 +98,9 @@
             const float32x4_t vi4567 = vld1q_f32(input + 4);
             const float32x4_t vi89AB = vld1q_f32(input + 8);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/12x4-minmax-neonfma.c b/src/f32-spmm/gen/12x4-minmax-neonfma.c
index f35ab40..3907163 100644
--- a/src/f32-spmm/gen/12x4-minmax-neonfma.c
+++ b/src/f32-spmm/gen/12x4-minmax-neonfma.c
@@ -55,8 +55,9 @@
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_f32(w); w += 4;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, vi4567, vw, 0);
           vacc89ABc0 = vfmaq_laneq_f32(vacc89ABc0, vi89AB, vw, 0);
@@ -127,7 +128,9 @@
             const float32x4_t vi4567 = vld1q_f32(input + 4);
             const float32x4_t vi89AB = vld1q_f32(input + 8);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c b/src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c
index 34f287c..8c7c9eb 100644
--- a/src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c
+++ b/src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c
@@ -39,7 +39,6 @@
     float32x4_t vi4567 = vld1q_f32(input + 4);
     float32x4_t vi89AB = vld1q_f32(input + 8);
     float32x4_t viCDEF = vld1q_f32(input + 12);
-    __builtin_prefetch(input + 16);
     size_t c = output_channels;
     do {
       uint32_t nnz = *nnzmap++;
@@ -55,14 +54,14 @@
           vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
           vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
-
+          __builtin_prefetch(input + 16);
           diff = *dmap++;
           vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vi0123 = vld1q_f32(input);
           vi4567 = vld1q_f32(input + 4);
           vi89AB = vld1q_f32(input + 8);
           viCDEF = vld1q_f32(input + 12);
-          __builtin_prefetch(input + 16);
         } while (--nnz != 0);
       }
       float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
@@ -100,7 +99,9 @@
             const float32x4_t vi0123 = vld1q_f32(input);
             const float32x4_t vi4567 = vld1q_f32(input + 4);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vb);
           } while (--nnz != 0);
@@ -130,7 +131,9 @@
             const intptr_t diff = *dmap++;
             const float32x4_t vi0123 = vld1q_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
           } while (--nnz != 0);
         }
@@ -156,7 +159,9 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi01 = vld1_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc01 = vfma_f32(vacc01, vi01, vb);
           } while (--nnz != 0);
         }
@@ -182,7 +187,9 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi0 = vld1_dup_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0 = vfma_f32(vacc0, vi0, vb);
           } while (--nnz != 0);
         }
diff --git a/src/f32-spmm/gen/16x1-minmax-neonfma-x2.c b/src/f32-spmm/gen/16x1-minmax-neonfma-x2.c
index abf0510..3d5aee4 100644
--- a/src/f32-spmm/gen/16x1-minmax-neonfma-x2.c
+++ b/src/f32-spmm/gen/16x1-minmax-neonfma-x2.c
@@ -52,9 +52,10 @@
         const float32x4_t vi4567x0 = vld1q_f32(input + 4);
         const float32x4_t vi89ABx0 = vld1q_f32(input + 8);
         const float32x4_t viCDEFx0 = vld1q_f32(input + 12);
-        __builtin_prefetch(input + 16);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
+        __builtin_prefetch(input + 16);
         const float32x4_t vw0 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x0 = vfmaq_f32(vacc0123x0, vi0123x0, vw0);
         vacc4567x0 = vfmaq_f32(vacc4567x0, vi4567x0, vw0);
         vacc89ABx0 = vfmaq_f32(vacc89ABx0, vi89ABx0, vw0);
@@ -63,9 +64,10 @@
         const float32x4_t vi4567x1 = vld1q_f32(input + 4);
         const float32x4_t vi89ABx1 = vld1q_f32(input + 8);
         const float32x4_t viCDEFx1 = vld1q_f32(input + 12);
-        __builtin_prefetch(input + 16);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
+        __builtin_prefetch(input + 16);
         const float32x4_t vw1 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x1 = vfmaq_f32(vacc0123x1, vi0123x1, vw1);
         vacc4567x1 = vfmaq_f32(vacc4567x1, vi4567x1, vw1);
         vacc89ABx1 = vfmaq_f32(vacc89ABx1, vi89ABx1, vw1);
@@ -87,7 +89,9 @@
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           const float32x4_t viCDEF = vld1q_f32(input + 12);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/16x1-minmax-neonfma.c b/src/f32-spmm/gen/16x1-minmax-neonfma.c
index 072ee6f..22502e8 100644
--- a/src/f32-spmm/gen/16x1-minmax-neonfma.c
+++ b/src/f32-spmm/gen/16x1-minmax-neonfma.c
@@ -48,7 +48,9 @@
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           const float32x4_t viCDEF = vld1q_f32(input + 12);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/16x2-minmax-neonfma.c b/src/f32-spmm/gen/16x2-minmax-neonfma.c
index ab7c1d6..822429c 100644
--- a/src/f32-spmm/gen/16x2-minmax-neonfma.c
+++ b/src/f32-spmm/gen/16x2-minmax-neonfma.c
@@ -51,10 +51,10 @@
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           const float32x4_t viCDEF = vld1q_f32(input + 12);
-          __builtin_prefetch(input + 16);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x2_t vw = vld1_f32(w); w += 2;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_lane_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_lane_f32(vacc4567c0, vi4567, vw, 0);
           vacc89ABc0 = vfmaq_lane_f32(vacc89ABc0, vi89AB, vw, 0);
@@ -111,7 +111,9 @@
             const float32x4_t vi89AB = vld1q_f32(input + 8);
             const float32x4_t viCDEF = vld1q_f32(input + 12);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/16x4-minmax-neonfma.c b/src/f32-spmm/gen/16x4-minmax-neonfma.c
index f124917..c887317 100644
--- a/src/f32-spmm/gen/16x4-minmax-neonfma.c
+++ b/src/f32-spmm/gen/16x4-minmax-neonfma.c
@@ -59,10 +59,10 @@
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           const float32x4_t vi89AB = vld1q_f32(input + 8);
           const float32x4_t viCDEF = vld1q_f32(input + 12);
-          __builtin_prefetch(input + 16);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_f32(w); w += 4;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, vi4567, vw, 0);
           vacc89ABc0 = vfmaq_laneq_f32(vacc89ABc0, vi89AB, vw, 0);
@@ -151,7 +151,9 @@
             const float32x4_t vi89AB = vld1q_f32(input + 8);
             const float32x4_t viCDEF = vld1q_f32(input + 12);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c b/src/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c
index b74ce47..b417362 100644
--- a/src/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c
+++ b/src/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c
@@ -43,7 +43,6 @@
     float32x4_t viKLMN = vld1q_f32(input + 20);
     float32x4_t viOPQR = vld1q_f32(input + 24);
     float32x4_t viSTUV = vld1q_f32(input + 28);
-    __builtin_prefetch(input + 16);
     size_t c = output_channels;
     do {
       uint32_t nnz = *nnzmap++;
@@ -67,9 +66,11 @@
           vaccOPQR = vfmaq_f32(vaccOPQR, viOPQR, vw);
           vaccSTUV = vfmaq_f32(vaccSTUV, viSTUV, vw);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
-
+          __builtin_prefetch(input + 16);
+          __builtin_prefetch(input + 32);
           diff = *dmap++;
           vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vi0123 = vld1q_f32(input);
           vi4567 = vld1q_f32(input + 4);
           vi89AB = vld1q_f32(input + 8);
@@ -78,7 +79,6 @@
           viKLMN = vld1q_f32(input + 20);
           viOPQR = vld1q_f32(input + 24);
           viSTUV = vld1q_f32(input + 28);
-          __builtin_prefetch(input + 16);
         } while (--nnz != 0);
       }
       float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
@@ -132,7 +132,10 @@
             const float32x4_t vi89AB = vld1q_f32(input + 8);
             const float32x4_t viCDEF = vld1q_f32(input + 12);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vb);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vb);
@@ -172,7 +175,10 @@
             const float32x4_t vi0123 = vld1q_f32(input);
             const float32x4_t vi4567 = vld1q_f32(input + 4);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vb);
           } while (--nnz != 0);
@@ -202,7 +208,10 @@
             const intptr_t diff = *dmap++;
             const float32x4_t vi0123 = vld1q_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
           } while (--nnz != 0);
         }
@@ -228,7 +237,10 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi01 = vld1_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc01 = vfma_f32(vacc01, vi01, vb);
           } while (--nnz != 0);
         }
@@ -254,7 +266,10 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi0 = vld1_dup_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0 = vfma_f32(vacc0, vi0, vb);
           } while (--nnz != 0);
         }
diff --git a/src/f32-spmm/gen/32x1-minmax-neonfma-x2.c b/src/f32-spmm/gen/32x1-minmax-neonfma-x2.c
index 9ec3165..202411c 100644
--- a/src/f32-spmm/gen/32x1-minmax-neonfma-x2.c
+++ b/src/f32-spmm/gen/32x1-minmax-neonfma-x2.c
@@ -64,9 +64,11 @@
         const float32x4_t viKLMNx0 = vld1q_f32(input + 20);
         const float32x4_t viOPQRx0 = vld1q_f32(input + 24);
         const float32x4_t viSTUVx0 = vld1q_f32(input + 28);
-        __builtin_prefetch(input + 16);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
+        __builtin_prefetch(input + 16);
+        __builtin_prefetch(input + 32);
         const float32x4_t vw0 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x0 = vfmaq_f32(vacc0123x0, vi0123x0, vw0);
         vacc4567x0 = vfmaq_f32(vacc4567x0, vi4567x0, vw0);
         vacc89ABx0 = vfmaq_f32(vacc89ABx0, vi89ABx0, vw0);
@@ -83,9 +85,11 @@
         const float32x4_t viKLMNx1 = vld1q_f32(input + 20);
         const float32x4_t viOPQRx1 = vld1q_f32(input + 24);
         const float32x4_t viSTUVx1 = vld1q_f32(input + 28);
-        __builtin_prefetch(input + 16);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
+        __builtin_prefetch(input + 16);
+        __builtin_prefetch(input + 32);
         const float32x4_t vw1 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x1 = vfmaq_f32(vacc0123x1, vi0123x1, vw1);
         vacc4567x1 = vfmaq_f32(vacc4567x1, vi4567x1, vw1);
         vacc89ABx1 = vfmaq_f32(vacc89ABx1, vi89ABx1, vw1);
@@ -123,7 +127,10 @@
           const float32x4_t viOPQR = vld1q_f32(input + 24);
           const float32x4_t viSTUV = vld1q_f32(input + 28);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
+          __builtin_prefetch(input + 32);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/32x1-minmax-neonfma.c b/src/f32-spmm/gen/32x1-minmax-neonfma.c
index 63e518a..93ac3bd 100644
--- a/src/f32-spmm/gen/32x1-minmax-neonfma.c
+++ b/src/f32-spmm/gen/32x1-minmax-neonfma.c
@@ -56,7 +56,10 @@
           const float32x4_t viOPQR = vld1q_f32(input + 24);
           const float32x4_t viSTUV = vld1q_f32(input + 28);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
+          __builtin_prefetch(input + 32);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/32x2-minmax-neonfma.c b/src/f32-spmm/gen/32x2-minmax-neonfma.c
index 417bf02..d46c17e 100644
--- a/src/f32-spmm/gen/32x2-minmax-neonfma.c
+++ b/src/f32-spmm/gen/32x2-minmax-neonfma.c
@@ -63,10 +63,11 @@
           const float32x4_t viKLMN = vld1q_f32(input + 20);
           const float32x4_t viOPQR = vld1q_f32(input + 24);
           const float32x4_t viSTUV = vld1q_f32(input + 28);
-          __builtin_prefetch(input + 16);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
+          __builtin_prefetch(input + 32);
           const float32x2_t vw = vld1_f32(w); w += 2;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_lane_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_lane_f32(vacc4567c0, vi4567, vw, 0);
           vacc89ABc0 = vfmaq_lane_f32(vacc89ABc0, vi89AB, vw, 0);
@@ -163,7 +164,10 @@
             const float32x4_t viOPQR = vld1q_f32(input + 24);
             const float32x4_t viSTUV = vld1q_f32(input + 28);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/32x4-minmax-neonfma.c b/src/f32-spmm/gen/32x4-minmax-neonfma.c
index fb21808..9f622af 100644
--- a/src/f32-spmm/gen/32x4-minmax-neonfma.c
+++ b/src/f32-spmm/gen/32x4-minmax-neonfma.c
@@ -79,10 +79,11 @@
           const float32x4_t viKLMN = vld1q_f32(input + 20);
           const float32x4_t viOPQR = vld1q_f32(input + 24);
           const float32x4_t viSTUV = vld1q_f32(input + 28);
-          __builtin_prefetch(input + 16);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
+          __builtin_prefetch(input + 32);
           const float32x4_t vw = vld1q_f32(w); w += 4;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, vi4567, vw, 0);
           vacc89ABc0 = vfmaq_laneq_f32(vacc89ABc0, vi89AB, vw, 0);
@@ -243,7 +244,10 @@
             const float32x4_t viOPQR = vld1q_f32(input + 24);
             const float32x4_t viSTUV = vld1q_f32(input + 28);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
+            __builtin_prefetch(input + 32);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
             vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
diff --git a/src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c b/src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c
index f396e95..5495d5d 100644
--- a/src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c
+++ b/src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c
@@ -45,9 +45,10 @@
         do {
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
-
+          __builtin_prefetch(input + 16);
           diff = *dmap++;
           vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vi0123 = vld1q_f32(input);
         } while (--nnz != 0);
       }
@@ -75,7 +76,9 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi01 = vld1_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc01 = vfma_f32(vacc01, vi01, vb);
           } while (--nnz != 0);
         }
@@ -101,7 +104,9 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi0 = vld1_dup_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0 = vfma_f32(vacc0, vi0, vb);
           } while (--nnz != 0);
         }
diff --git a/src/f32-spmm/gen/4x1-minmax-neonfma-x2.c b/src/f32-spmm/gen/4x1-minmax-neonfma-x2.c
index c0371fa..e295bea 100644
--- a/src/f32-spmm/gen/4x1-minmax-neonfma-x2.c
+++ b/src/f32-spmm/gen/4x1-minmax-neonfma-x2.c
@@ -44,11 +44,15 @@
         dmap += 2;
         const float32x4_t vi0123x0 = vld1q_f32(input);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
+        __builtin_prefetch(input + 16);
         const float32x4_t vw0 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x0 = vfmaq_f32(vacc0123x0, vi0123x0, vw0);
         const float32x4_t vi0123x1 = vld1q_f32(input);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
+        __builtin_prefetch(input + 16);
         const float32x4_t vw1 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x1 = vfmaq_f32(vacc0123x1, vi0123x1, vw1);
       }
       float32x4_t vacc0123 = vacc0123x0;
@@ -58,7 +62,9 @@
           const intptr_t diff = *dmap++;
           const float32x4_t vi0123 = vld1q_f32(input);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
         } while (--nnz != 0);
       }
diff --git a/src/f32-spmm/gen/4x1-minmax-neonfma.c b/src/f32-spmm/gen/4x1-minmax-neonfma.c
index d98524c..3c30fe4 100644
--- a/src/f32-spmm/gen/4x1-minmax-neonfma.c
+++ b/src/f32-spmm/gen/4x1-minmax-neonfma.c
@@ -42,7 +42,9 @@
           const intptr_t diff = *dmap++;
           const float32x4_t vi0123 = vld1q_f32(input);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
         } while (--nnz != 0);
       }
diff --git a/src/f32-spmm/gen/4x2-minmax-neonfma.c b/src/f32-spmm/gen/4x2-minmax-neonfma.c
index d5730b4..dcd98ae 100644
--- a/src/f32-spmm/gen/4x2-minmax-neonfma.c
+++ b/src/f32-spmm/gen/4x2-minmax-neonfma.c
@@ -43,8 +43,9 @@
           const intptr_t diff = *dmap++;
           const float32x4_t vi0123 = vld1q_f32(input);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x2_t vw = vld1_f32(w); w += 2;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_lane_f32(vacc0123c0, vi0123, vw, 0);
           vacc0123c1 = vfmaq_lane_f32(vacc0123c1, vi0123, vw, 1);
         } while (--nnz != 0);
@@ -71,7 +72,9 @@
             const intptr_t diff = *dmap++;
             const float32x4_t vi0123 = vld1q_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           } while (--nnz != 0);
         }
diff --git a/src/f32-spmm/gen/4x4-minmax-neonfma.c b/src/f32-spmm/gen/4x4-minmax-neonfma.c
index 04e3574..ecc9b58 100644
--- a/src/f32-spmm/gen/4x4-minmax-neonfma.c
+++ b/src/f32-spmm/gen/4x4-minmax-neonfma.c
@@ -45,8 +45,9 @@
           const intptr_t diff = *dmap++;
           const float32x4_t vi0123 = vld1q_f32(input);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_f32(w); w += 4;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, vi0123, vw, 0);
           vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, vi0123, vw, 1);
           vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, vi0123, vw, 2);
@@ -81,7 +82,9 @@
             const intptr_t diff = *dmap++;
             const float32x4_t vi0123 = vld1q_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           } while (--nnz != 0);
         }
diff --git a/src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c b/src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c
index c5dd092..72255a7 100644
--- a/src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c
+++ b/src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c
@@ -48,9 +48,10 @@
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
-
+          __builtin_prefetch(input + 16);
           diff = *dmap++;
           vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vi0123 = vld1q_f32(input);
           vi4567 = vld1q_f32(input + 4);
         } while (--nnz != 0);
@@ -82,7 +83,9 @@
             const intptr_t diff = *dmap++;
             const float32x4_t vi0123 = vld1q_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
           } while (--nnz != 0);
         }
@@ -108,7 +111,9 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi01 = vld1_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc01 = vfma_f32(vacc01, vi01, vb);
           } while (--nnz != 0);
         }
@@ -134,7 +139,9 @@
             const intptr_t diff = *dmap++;
             const float32x2_t vi0 = vld1_dup_f32(input);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x2_t vb = vld1_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0 = vfma_f32(vacc0, vi0, vb);
           } while (--nnz != 0);
         }
diff --git a/src/f32-spmm/gen/8x1-minmax-neonfma-x2.c b/src/f32-spmm/gen/8x1-minmax-neonfma-x2.c
index a3c9262..8526fef 100644
--- a/src/f32-spmm/gen/8x1-minmax-neonfma-x2.c
+++ b/src/f32-spmm/gen/8x1-minmax-neonfma-x2.c
@@ -47,13 +47,17 @@
         const float32x4_t vi0123x0 = vld1q_f32(input);
         const float32x4_t vi4567x0 = vld1q_f32(input + 4);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
+        __builtin_prefetch(input + 16);
         const float32x4_t vw0 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x0 = vfmaq_f32(vacc0123x0, vi0123x0, vw0);
         vacc4567x0 = vfmaq_f32(vacc4567x0, vi4567x0, vw0);
         const float32x4_t vi0123x1 = vld1q_f32(input);
         const float32x4_t vi4567x1 = vld1q_f32(input + 4);
         input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
+        __builtin_prefetch(input + 16);
         const float32x4_t vw1 = vld1q_dup_f32(w); w += 1;
+        __builtin_prefetch(w + 32);
         vacc0123x1 = vfmaq_f32(vacc0123x1, vi0123x1, vw1);
         vacc4567x1 = vfmaq_f32(vacc4567x1, vi4567x1, vw1);
       }
@@ -67,7 +71,9 @@
           const float32x4_t vi0123 = vld1q_f32(input);
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
         } while (--nnz != 0);
diff --git a/src/f32-spmm/gen/8x1-minmax-neonfma.c b/src/f32-spmm/gen/8x1-minmax-neonfma.c
index c4dc8a3..c68524e 100644
--- a/src/f32-spmm/gen/8x1-minmax-neonfma.c
+++ b/src/f32-spmm/gen/8x1-minmax-neonfma.c
@@ -44,7 +44,9 @@
           const float32x4_t vi0123 = vld1q_f32(input);
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
           vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
         } while (--nnz != 0);
diff --git a/src/f32-spmm/gen/8x2-minmax-neonfma.c b/src/f32-spmm/gen/8x2-minmax-neonfma.c
index c7d3dc4..62f3641 100644
--- a/src/f32-spmm/gen/8x2-minmax-neonfma.c
+++ b/src/f32-spmm/gen/8x2-minmax-neonfma.c
@@ -46,8 +46,9 @@
           const float32x4_t vi0123 = vld1q_f32(input);
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x2_t vw = vld1_f32(w); w += 2;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_lane_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_lane_f32(vacc4567c0, vi4567, vw, 0);
           vacc0123c1 = vfmaq_lane_f32(vacc0123c1, vi0123, vw, 1);
@@ -84,7 +85,9 @@
             const float32x4_t vi0123 = vld1q_f32(input);
             const float32x4_t vi4567 = vld1q_f32(input + 4);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           } while (--nnz != 0);
diff --git a/src/f32-spmm/gen/8x4-minmax-neonfma.c b/src/f32-spmm/gen/8x4-minmax-neonfma.c
index d4c8704..8db01b7 100644
--- a/src/f32-spmm/gen/8x4-minmax-neonfma.c
+++ b/src/f32-spmm/gen/8x4-minmax-neonfma.c
@@ -50,8 +50,9 @@
           const float32x4_t vi0123 = vld1q_f32(input);
           const float32x4_t vi4567 = vld1q_f32(input + 4);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          __builtin_prefetch(input + 16);
           const float32x4_t vw = vld1q_f32(w); w += 4;
-
+          __builtin_prefetch(w + 32);
           vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, vi0123, vw, 0);
           vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, vi4567, vw, 0);
           vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, vi0123, vw, 1);
@@ -104,7 +105,9 @@
             const float32x4_t vi0123 = vld1q_f32(input);
             const float32x4_t vi4567 = vld1q_f32(input + 4);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            __builtin_prefetch(input + 16);
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
             vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
           } while (--nnz != 0);
diff --git a/src/f32-spmm/neon-blocked.c.in b/src/f32-spmm/neon-blocked.c.in
index 35da044..24ebd8e 100644
--- a/src/f32-spmm/neon-blocked.c.in
+++ b/src/f32-spmm/neon-blocked.c.in
@@ -45,16 +45,16 @@
           const float32x4_t vi${ABC[0:4]} = vld1q_f32(input);
           $for M in range(4, MR, 4):
             const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
-          $if MR >= 16:
-            __builtin_prefetch(input + 16);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          $for M in range(0, MR, 16):
+            __builtin_prefetch(input + ${M+16});
           $if NR == 1:
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
           $elif NR == 2:
             const float32x2_t vw = vld1_f32(w); w += 2;
           $elif NR == 4:
             const float32x4_t vw = vld1q_f32(w); w += 4;
-
+          __builtin_prefetch(w + 32);
           $if NR == 1:
             $for M in range(0, MR, 4):
               vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw);
@@ -93,7 +93,10 @@
             $for M in range(4, MR, 4):
               const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+            $for M in range(0, MR, 16):
+              __builtin_prefetch(input + ${M+16});
             const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             $for M in range(0, MR, 4):
               vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw);
           } while (--nnz != 0);
diff --git a/src/f32-spmm/neon-pipelined.c.in b/src/f32-spmm/neon-pipelined.c.in
index c63ba74..fab507f 100644
--- a/src/f32-spmm/neon-pipelined.c.in
+++ b/src/f32-spmm/neon-pipelined.c.in
@@ -36,8 +36,6 @@
     float32x4_t vi0123 = vld1q_f32(input);
     $for M in range(4, MR, 4):
       float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
-    $if MR >= 16:
-      __builtin_prefetch(input + 16);
     size_t c = output_channels;
     do {
       uint32_t nnz = *nnzmap++;
@@ -49,14 +47,14 @@
           $for M in range(0, MR, 4):
             vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw);
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
-
+          $for M in range(0, MR, 16):
+            __builtin_prefetch(input + ${M+16});
           diff = *dmap++;
           vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           vi0123 = vld1q_f32(input);
           $for M in range(4, MR, 4):
             vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
-          $if MR >= 16:
-            __builtin_prefetch(input + 16);
         } while (--nnz != 0);
       }
       $for M in range(0, MR, 4):
@@ -101,10 +99,13 @@
               $for M in range(4, SUBMR, 4):
                 const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+              $for M in range(0, MR, 16):
+                __builtin_prefetch(input + ${M+16});
               $if SUBMR <= 2:
                 const float32x2_t vb = vld1_dup_f32(w); w += 1;
               $else:
                 const float32x4_t vb = vld1q_dup_f32(w); w += 1;
+              __builtin_prefetch(w + 32);
               $if SUBMR <= 2:
                 vacc${ABC[0:SUBMR]} = vfma_f32(vacc${ABC[0:SUBMR]}, vi${ABC[0:SUBMR]}, vb);
               $else:
diff --git a/src/f32-spmm/neon.c.in b/src/f32-spmm/neon.c.in
index a7d08e3..16dfbbb 100644
--- a/src/f32-spmm/neon.c.in
+++ b/src/f32-spmm/neon.c.in
@@ -50,10 +50,11 @@
             const float32x4_t vi0123x${K} = vld1q_f32(input);
             $for M in range(4, MR, 4):
               const float32x4_t vi${ABC[M:M+4]}x${K} = vld1q_f32(input + ${M});
-            $if MR >= 16:
-              __builtin_prefetch(input + 16);
             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff${K});
+            $for M in range(0, MR, 16):
+              __builtin_prefetch(input + ${M+16});
             const float32x4_t vw${K} = vld1q_dup_f32(w); w += 1;
+            __builtin_prefetch(w + 32);
             $for M in range(0, MR, 4):
               vacc${ABC[M:M+4]}x${K} = vfmaq_f32(vacc${ABC[M:M+4]}x${K}, vi${ABC[M:M+4]}x${K}, vw${K});
         }
@@ -73,7 +74,10 @@
           $for M in range(4, MR, 4):
             const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
+          $for M in range(0, MR, 16):
+            __builtin_prefetch(input + ${M+16});
           const float32x4_t vw = vld1q_dup_f32(w); w += 1;
+          __builtin_prefetch(w + 32);
           $for M in range(0, MR, 4):
             vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw);
         } while (--nnz != 0);