Support Convolution, Deconvolution, and Fully Connected operators without bias

PiperOrigin-RevId: 277524405
diff --git a/src/convolution-spnchw.c b/src/convolution-spnchw.c
index 63ea965..fd36480 100644
--- a/src/convolution-spnchw.c
+++ b/src/convolution-spnchw.c
@@ -280,8 +280,14 @@
       size_t first_ic = 0, last_ic = 0;
       bool first_nonzero = true;
       for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
-        for (size_t oco = 0; oco < output_channels_block_size; oco++) {
-          *nonzero_values++ = bias[ocb + oco];
+        if XNN_LIKELY(bias != NULL) {
+          for (size_t oco = 0; oco < output_channels_block_size; oco++) {
+            *nonzero_values++ = bias[ocb + oco];
+          }
+        } else {
+          for (size_t oco = 0; oco < output_channels_block_size; oco++) {
+            *nonzero_values++ = 0.0f;
+          }
         }
         for (size_t ic = 0; ic < group_input_channels; ic++) {
           bool is_nonzero_block = false;
@@ -311,7 +317,11 @@
         output_channel_nonzeros += 1;
       }
       for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
-        *nonzero_values++ = bias[oc];
+        if XNN_LIKELY(bias != NULL) {
+          *nonzero_values++ = bias[oc];
+        } else {
+          *nonzero_values++ = 0.0f;
+        }
         for (size_t ic = 0; ic < group_input_channels; ic++) {
           const float weight = kernel[oc * group_input_channels + ic];
           if (weight != 0.0f) {
diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h
index ab13696..865c5d7 100644
--- a/src/xnnpack/pack.h
+++ b/src/xnnpack/pack.h
@@ -30,9 +30,17 @@
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       int32_t* packed_b = (int32_t*) packed_w;
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
       }
       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
@@ -52,7 +60,9 @@
       }
     }
     k += nc * kc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   } while (--g != 0);
 }
 
@@ -74,9 +84,17 @@
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       int32_t* packed_b = (int32_t*) packed_w;
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
       }
       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
       for (size_t ki = 0; ki < ks; ki++) {
@@ -99,7 +117,9 @@
       }
     }
     k += ks * kc * nc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   } while (--g != 0);
 }
 
@@ -120,9 +140,17 @@
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       int32_t* packed_b = (int32_t*) packed_w;
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
       }
       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
       for (size_t ki = 0; ki < ks; ki++) {
@@ -137,7 +165,9 @@
       }
     }
     k += nc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   }
 }
 
@@ -168,9 +198,17 @@
         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
           const size_t nr_block_size = min(nc - nr_block_start, nr);
           int32_t* packed_b = (int32_t*) packed_w;
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-            packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+            }
+          } else {
+            size_t n = nr_block_size;
+            do {
+              *((int32_t*) packed_w) = boff;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+            } while (--n != 0);
           }
           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
           for (size_t ky = oy; ky < kh; ky += sh) {
@@ -197,7 +235,9 @@
       }
     }
     k += kh * kw * kc * nc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   }
 }
 
@@ -216,9 +256,17 @@
   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
     const size_t cr_block_size = min(c - cr_block_start, cr);
     int32_t* packed_b = (int32_t*) packed_w;
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
-      packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
     }
     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
     for (size_t x = 0; x < w; x++) {
@@ -250,9 +298,17 @@
   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
     const size_t cr_block_size = min(c - cr_block_start, cr);
     int32_t* packed_b = (int32_t*) packed_w;
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
-      packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
     }
     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
     for (size_t x = 0; x < w; x++) {
@@ -282,10 +338,12 @@
   do {
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *packed_w++ = b[nr_block_start + nr_block_offset];
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
       }
-      packed_w += nr - nr_block_size;
+      packed_w += nr;
       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
         const size_t kr_block_size = min(kc - kr_block_start, kr);
         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
@@ -299,7 +357,9 @@
       }
     }
     k += nc * kc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   } while (--g != 0);
 }
 
@@ -320,10 +380,12 @@
   do {
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *packed_w++ = b[nr_block_start + nr_block_offset];
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
       }
-      packed_w += nr - nr_block_size;
+      packed_w += nr;
 
       for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
@@ -348,7 +410,9 @@
       }
     }
     k += nc * kc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   } while (--g != 0);
 }
 
@@ -413,10 +477,12 @@
   do {
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *packed_w++ = b[nr_block_start + nr_block_offset];
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
       }
-      packed_w += nr - nr_block_size;
+      packed_w += nr;
 
       for (size_t ki = 0; ki < ks; ki++) {
         for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
@@ -443,7 +509,9 @@
       }
     }
     k += ks * kc * nc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   } while (--g != 0);
 }
 
@@ -460,10 +528,12 @@
   for (size_t i = 0; i < g; i++) {
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *packed_w++ = b[nr_block_start + nr_block_offset];
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
       }
-      packed_w += nr - nr_block_size;
+      packed_w += nr;
       for (size_t ki = 0; ki < ks; ki++) {
         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
           *packed_w =
@@ -474,7 +544,9 @@
       }
     }
     k += nc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   }
 }
 
@@ -490,8 +562,15 @@
 {
   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
     const size_t nr_block_size = min(nc - nr_block_start, nr);
-    for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
-      *packed_w++ = b[nr_block_start + min(nr_block_offset, nr_block_size - 1)];
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
+      }
+    } else {
+      size_t n = nr;
+      do {
+        *packed_w++ = 0.0f;
+      } while (--n != 0);
     }
 
     for (size_t kx = 0; kx < kw; kx++) {
@@ -503,6 +582,9 @@
         }
       }
     }
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nr;
+    }
   }
 }
 
@@ -533,10 +615,12 @@
         }
         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
           const size_t nr_block_size = min(nc - nr_block_start, nr);
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            *packed_w++ = b[nr_block_start + nr_block_offset];
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+            }
           }
-          packed_w += nr - nr_block_size;
+          packed_w += nr;
           for (size_t ky = oy; ky < kh; ky += sh) {
             for (size_t kx = ox; kx < kw; kx += sw) {
               for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
@@ -566,7 +650,9 @@
       }
     }
     k += kh * kw * kc * nc;
-    b += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
   }
 }
 
@@ -581,8 +667,15 @@
 {
   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
     const size_t cr_block_size = min(c - cr_block_start, cr);
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      *packed_w++ = b[cr_block_start + cr_block_offset];
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *packed_w++ = b[cr_block_start + cr_block_offset];
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *packed_w++ = 0.0f;
+      } while (--n != 0);
     }
     packed_w += cr - cr_block_size;
     for (size_t x = 0; x < w; x++) {
@@ -608,8 +701,15 @@
 {
   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
     const size_t cr_block_size = min(c - cr_block_start, cr);
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      *packed_w++ = b[cr_block_start + cr_block_offset];
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *packed_w++ = b[cr_block_start + cr_block_offset];
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *packed_w++ = 0.0f;
+      } while (--n != 0);
     }
     packed_w += cr - cr_block_size;
     for (size_t x = 0; x < w; x++) {
@@ -632,7 +732,12 @@
   float* packed_weights)
 {
   for (size_t g = 0; g < groups; g++) {
-    *packed_weights++ = *bias++;
+    if XNN_LIKELY(bias != NULL) {
+      *packed_weights = *bias++;
+    } else {
+      *packed_weights = 0.0f;
+    }
+    packed_weights += 1;
     for (size_t i = 0; i < kernel_size; i++) {
       *packed_weights++ = kernel[g * kernel_size + i];
     }
@@ -649,12 +754,19 @@
   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
     const size_t cr_block_size = min(c - cr_block_start, cr);
     for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      packed_w[cr_block_offset] = s[cr_block_start + cr_block_offset];
+      *packed_w++ = s[cr_block_start + cr_block_offset];
     }
-    packed_w += cr;
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      packed_w[cr_block_offset] = b[cr_block_start + cr_block_offset];
+    packed_w += cr - cr_block_size;
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *packed_w++ = b[cr_block_start + cr_block_offset];
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *packed_w++ = 0.0f;
+      } while (--n != 0);
     }
-    packed_w += cr;
+    packed_w += cr - cr_block_size;
   }
 }
diff --git a/test/convolution-operator-tester.h b/test/convolution-operator-tester.h
index e637fac..115b846 100644
--- a/test/convolution-operator-tester.h
+++ b/test/convolution-operator-tester.h
@@ -467,6 +467,15 @@
     return this->depthwise_layout_;
   }
 
+  inline ConvolutionOperatorTester& has_bias(bool has_bias) {
+    this->has_bias_ = has_bias;
+    return *this;
+  }
+
+  inline bool has_bias() const {
+    return this->has_bias_;
+  }
+
   inline ConvolutionOperatorTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
@@ -500,17 +509,21 @@
       std::fill(output.begin(), output.end(), 0xA5);
 
       // Compute reference results, without renormalization.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(accumulators.begin(), accumulators.end(), 0);
       }
       if (depthwise_layout()) {
         ASSERT_EQ(group_input_channels(), 1);
@@ -596,7 +609,7 @@
           input_pixel_stride(), output_pixel_stride(),
           input_zero_point, 1.0f /* input scale */,
           kernel_zero_point, 1.0f /* kernel scale */,
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_zero_point, output_scale, qmin(), qmax(),
           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
           &convolution_op));
@@ -656,17 +669,21 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results, without clamping.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
       }
       if (depthwise_layout()) {
         ASSERT_EQ(group_input_channels(), 1);
@@ -747,7 +764,7 @@
           dilation_height(), dilation_width(),
           groups(), group_input_channels(), group_output_channels(),
           input_pixel_stride(), output_pixel_stride(),
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_min, output_max,
           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
           &convolution_op));
@@ -819,17 +836,21 @@
       std::fill(output.begin(), output.end(), 0xA5);
 
       // Compute reference results, without renormalization.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(accumulators.begin(), accumulators.end(), 0);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oy = 0; oy < output_height(); oy++) {
@@ -886,7 +907,7 @@
           input_pixel_stride(), output_pixel_stride(),
           input_zero_point, 1.0f /* input scale */,
           kernel_zero_point, 1.0f /* kernel scale */,
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_zero_point, output_scale, qmin(), qmax(),
           0, &convolution_op));
 
@@ -929,17 +950,21 @@
       std::fill(output.begin(), output.end(), 0xA5);
 
       // Compute reference results for the second run, including renormalization.
-      for (size_t i = 0; i < next_batch_size(); i++) {
-        for (size_t oy = 0; oy < next_output_height(); oy++) {
-          for (size_t ox = 0; ox < next_output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < next_batch_size(); i++) {
+          for (size_t oy = 0; oy < next_output_height(); oy++) {
+            for (size_t ox = 0; ox < next_output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
       }
       for (size_t i = 0; i < next_batch_size(); i++) {
         for (size_t oy = 0; oy < next_output_height(); oy++) {
@@ -1030,17 +1055,21 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results, without clamping.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oy = 0; oy < output_height(); oy++) {
@@ -1092,7 +1121,7 @@
           dilation_height(), dilation_width(),
           groups(), group_input_channels(), group_output_channels(),
           input_pixel_stride(), output_pixel_stride(),
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_min, output_max,
           0, &convolution_op));
 
@@ -1135,17 +1164,21 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results for the second run, including clamping.
-      for (size_t i = 0; i < next_batch_size(); i++) {
-        for (size_t oy = 0; oy < next_output_height(); oy++) {
-          for (size_t ox = 0; ox < next_output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < next_batch_size(); i++) {
+          for (size_t oy = 0; oy < next_output_height(); oy++) {
+            for (size_t ox = 0; ox < next_output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
       }
       for (size_t i = 0; i < next_batch_size(); i++) {
         for (size_t oy = 0; oy < next_output_height(); oy++) {
@@ -1236,5 +1269,6 @@
   uint8_t qmin_{0};
   uint8_t qmax_{255};
   bool depthwise_layout_{false};
+  bool has_bias_{true};
   size_t iterations_{1};
 };
diff --git a/test/convolution-spnchw-operator-tester.h b/test/convolution-spnchw-operator-tester.h
index d860add..a117633 100644
--- a/test/convolution-spnchw-operator-tester.h
+++ b/test/convolution-spnchw-operator-tester.h
@@ -359,6 +359,15 @@
     return this->depthwise_layout_;
   }
 
+  inline ConvolutionSpNCHWOperatorTester& has_bias(bool has_bias) {
+    this->has_bias_ = has_bias;
+    return *this;
+  }
+
+  inline bool has_bias() const {
+    return this->has_bias_;
+  }
+
   inline ConvolutionSpNCHWOperatorTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
@@ -397,17 +406,21 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results, without clamping.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
       }
       if (nhwc_input()) {
         for (size_t i = 0; i < batch_size(); i++) {
@@ -486,7 +499,7 @@
         subsampling_height(), subsampling_width(),
         dilation_height(), dilation_width(),
         groups(), group_input_channels(), group_output_channels(),
-        kernel.data(), bias.data(),
+        kernel.data(), has_bias() ? bias.data() : nullptr,
         output_min, output_max,
         (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0),
         &convolution_op);
@@ -555,5 +568,6 @@
   uint8_t qmin_{0};
   uint8_t qmax_{255};
   bool depthwise_layout_{false};
+  bool has_bias_{true};
   size_t iterations_{1};
 };
diff --git a/test/convolution-spnchw.cc b/test/convolution-spnchw.cc
index cad38eb..7690039 100644
--- a/test/convolution-spnchw.cc
+++ b/test/convolution-spnchw.cc
@@ -107,6 +107,17 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_SpNHWC_OP_F32, 1x1_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(19)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** SPMM path, batched ****************************/
 
 TEST(CONVOLUTION_SpNHWC_OP_F32, batched_1x1) {
@@ -238,6 +249,18 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_SpNHWC_OP_F32, batched_1x1_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(19)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** DConv 3x3c3s2 HWC->SpCHW path ****************************/
 
 TEST(CONVOLUTION_HWC2SpNHWC_OP_F32, 3x3c3s2) {
@@ -330,6 +353,20 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_HWC2SpNHWC_OP_F32, 3x3c3s2_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .padding(1)
+    .kernel_size(3, 3)
+    .subsampling(2)
+    .group_input_channels(3)
+    .group_output_channels(19)
+    .nhwc_input(true)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** DConv 3x3c3s2 HWC->SpCHW path, batched ****************************/
 
 TEST(CONVOLUTION_HWC2SpNHWC_OP_F32, batched_3x3c3s2) {
@@ -443,6 +480,21 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_HWC2SpNHWC_OP_F32, batched_3x3c3s2_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .padding(1)
+    .kernel_size(3, 3)
+    .subsampling(2)
+    .group_input_channels(3)
+    .group_output_channels(19)
+    .nhwc_input(true)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** DWCONV 3x3 path ****************************/
 
 TEST(CONVOLUTION_SpNHWC_OP_F32, depthwise_3x3) {
@@ -529,6 +581,17 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_SpNHWC_OP_F32, depthwise_3x3_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(3, 3)
+    .padding_width(1)
+    .groups(19)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** DWCONV 3x3 path, batched ****************************/
 
 TEST(CONVOLUTION_SpNHWC_OP_F32, batched_depthwise_3x3) {
@@ -646,6 +709,18 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_SpNHWC_OP_F32, batched_depthwise_3x3_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(3, 3)
+    .padding_width(1)
+    .groups(19)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** DWCONV 3x3 stride-2 path ****************************/
 
 TEST(CONVOLUTION_SpNHWC_OP_F32, depthwise_3x3s2) {
@@ -739,6 +814,18 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_SpNHWC_OP_F32, depthwise_3x3s2_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(3, 3)
+    .padding_width(1)
+    .subsampling(2)
+    .groups(19)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** DWCONV 3x3 stride-2 path, batched ****************************/
 
 TEST(CONVOLUTION_SpNHWC_OP_F32, batched_depthwise_3x3s2) {
@@ -864,3 +951,16 @@
     .iterations(3)
     .TestF32();
 }
+
+TEST(CONVOLUTION_SpNHWC_OP_F32, batched_depthwise_3x3s2_without_bias) {
+  ConvolutionSpNCHWOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(3, 3)
+    .padding_width(1)
+    .subsampling(2)
+    .groups(19)
+    .iterations(3)
+    .TestF32();
+}
diff --git a/test/convolution.cc b/test/convolution.cc
index ebbdfd3..a173619 100644
--- a/test/convolution.cc
+++ b/test/convolution.cc
@@ -65,6 +65,17 @@
     .TestQ8();
 }
 
+TEST(CONVOLUTION_OP_Q8, 1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(13, 14)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(19)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(CONVOLUTION_OP_Q8, 1x1_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -135,6 +146,18 @@
     .TestQ8();
 }
 
+TEST(CONVOLUTION_OP_Q8, grouped_1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(24, 25)
+    .kernel_size(1, 1)
+    .groups(2)
+    .group_input_channels(17)
+    .group_output_channels(19)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(CONVOLUTION_OP_Q8, grouped_1x1_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -282,6 +305,18 @@
     .TestQ8();
 }
 
+TEST(CONVOLUTION_OP_Q8, 3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .group_input_channels(15)
+    .group_output_channels(17)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(CONVOLUTION_OP_Q8, 3x3_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -390,6 +425,19 @@
     .TestQ8();
 }
 
+TEST(CONVOLUTION_OP_Q8, grouped_3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 11)
+    .padding(1)
+    .kernel_size(3, 3)
+    .groups(2)
+    .group_input_channels(14)
+    .group_output_channels(13)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(CONVOLUTION_OP_Q8, grouped_3x3_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -533,6 +581,17 @@
     .TestQ8();
 }
 
+TEST(CONVOLUTION_OP_Q8, depthwise_3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(1, 1)
+    .kernel_size(3, 3)
+    .groups(27)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(CONVOLUTION_OP_Q8, depthwise_3x3s2) {
   ConvolutionOperatorTester()
     .input_size(15, 14)
@@ -665,6 +724,17 @@
     .TestQ8();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_Q8, 1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .kernel_size(1, 1)
+    .groups(24)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(DEPTHWISE_CONVOLUTION_OP_Q8, 3x3) {
   ConvolutionOperatorTester()
     .depthwise_layout(true)
@@ -688,6 +758,18 @@
     .TestQ8();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_Q8, 3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(1, 1)
+    .kernel_size(3, 3)
+    .groups(24)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(DEPTHWISE_CONVOLUTION_OP_Q8, 3x3s2_with_tf_same_padding) {
   for (size_t input_height = 14; input_height <= 15; input_height++) {
     for (size_t input_width = 14; input_width <= 15; input_width++) {
@@ -714,6 +796,18 @@
     .TestQ8();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_Q8, 5x5_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(2, 2)
+    .kernel_size(5, 5)
+    .groups(24)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(CONVOLUTION_OP_Q8, setup_increasing_batch) {
   ASSERT_EQ(xnn_status_success, xnn_initialize());
   ConvolutionOperatorTester()
@@ -1055,6 +1149,17 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, 1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(13, 14)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(19)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, 1x1_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -1125,6 +1230,18 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, grouped_1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(24, 25)
+    .kernel_size(1, 1)
+    .groups(2)
+    .group_input_channels(17)
+    .group_output_channels(19)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, grouped_1x1_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -1385,6 +1502,18 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, 3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .group_input_channels(15)
+    .group_output_channels(17)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, 3x3_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -1493,6 +1622,19 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, grouped_3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 11)
+    .padding(1)
+    .kernel_size(3, 3)
+    .groups(2)
+    .group_input_channels(14)
+    .group_output_channels(13)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, grouped_3x3_with_batch) {
   ConvolutionOperatorTester()
     .batch_size(3)
@@ -1758,6 +1900,16 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, depthwise_1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(15, 14)
+    .kernel_size(1, 1)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, depthwise_2x2) {
   ConvolutionOperatorTester()
     .input_size(15, 14)
@@ -1768,6 +1920,17 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, depthwise_2x2_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(1, 1)
+    .kernel_size(2, 2)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, depthwise_2x2s2) {
   ConvolutionOperatorTester()
     .input_size(15, 14)
@@ -1844,6 +2007,17 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, depthwise_3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(1, 1)
+    .kernel_size(3, 3)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, depthwise_3x3s2) {
   ConvolutionOperatorTester()
     .input_size(15, 14)
@@ -1965,6 +2139,17 @@
     .TestF32();
 }
 
+TEST(CONVOLUTION_OP_F32, depthwise_5x5_without_bias) {
+  ConvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(2, 2)
+    .kernel_size(5, 5)
+    .groups(27)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, depthwise_5x5s2) {
   ConvolutionOperatorTester()
     .input_size(15, 14)
@@ -2052,6 +2237,17 @@
     .TestF32();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_F32, 1x1_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .kernel_size(1, 1)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(DEPTHWISE_CONVOLUTION_OP_F32, 2x2) {
   ConvolutionOperatorTester()
     .depthwise_layout(true)
@@ -2075,6 +2271,18 @@
     .TestF32();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_F32, 2x2_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(1, 1)
+    .kernel_size(2, 2)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(DEPTHWISE_CONVOLUTION_OP_F32, 3x3) {
   ConvolutionOperatorTester()
     .depthwise_layout(true)
@@ -2098,6 +2306,18 @@
     .TestF32();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_F32, 3x3_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(1, 1)
+    .kernel_size(3, 3)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(DEPTHWISE_CONVOLUTION_OP_F32, 3x3s2_with_tf_same_padding) {
   for (size_t input_height = 14; input_height <= 15; input_height++) {
     for (size_t input_width = 14; input_width <= 15; input_width++) {
@@ -2136,6 +2356,18 @@
     .TestF32();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_F32, 5x5_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(2, 2)
+    .kernel_size(5, 5)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(DEPTHWISE_CONVOLUTION_OP_F32, 7x7) {
   ConvolutionOperatorTester()
     .depthwise_layout(true)
@@ -2147,6 +2379,18 @@
     .TestF32();
 }
 
+TEST(DEPTHWISE_CONVOLUTION_OP_F32, 7x7_without_bias) {
+  ConvolutionOperatorTester()
+    .depthwise_layout(true)
+    .has_bias(false)
+    .input_size(15, 14)
+    .padding(3, 3)
+    .kernel_size(7, 7)
+    .groups(24)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(CONVOLUTION_OP_F32, setup_increasing_batch) {
   ASSERT_EQ(xnn_status_success, xnn_initialize());
   ConvolutionOperatorTester()
diff --git a/test/deconvolution-operator-tester.h b/test/deconvolution-operator-tester.h
index bb836a6..67f3548 100644
--- a/test/deconvolution-operator-tester.h
+++ b/test/deconvolution-operator-tester.h
@@ -409,6 +409,15 @@
     return this->qmax_;
   }
 
+  inline DeconvolutionOperatorTester& has_bias(bool has_bias) {
+    this->has_bias_ = has_bias;
+    return *this;
+  }
+
+  inline bool has_bias() const {
+    return this->has_bias_;
+  }
+
   inline DeconvolutionOperatorTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
@@ -442,17 +451,21 @@
       std::fill(output.begin(), output.end(), 0xA5);
 
       // Compute reference results, without renormalization.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(accumulators.begin(), accumulators.end(), 0);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oy = 0; oy < output_height(); oy++) {
@@ -512,7 +525,7 @@
           input_pixel_stride(), output_pixel_stride(),
           input_zero_point, 1.0f /* input scale */,
           kernel_zero_point, 1.0f /* kernel scale */,
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_zero_point, output_scale, qmin(), qmax(),
           0, &deconvolution_op));
 
@@ -572,17 +585,21 @@
       std::fill(output_ref.begin(), output_ref.end(), 0.0f);
 
       // Compute reference results, without clamping.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oy = 0; oy < output_height(); oy++) {
@@ -637,7 +654,7 @@
           dilation_height(), dilation_width(),
           groups(), group_input_channels(), group_output_channels(),
           input_pixel_stride(), output_pixel_stride(),
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_min, output_max,
           0, &deconvolution_op));
 
@@ -706,17 +723,21 @@
       std::fill(output.begin(), output.end(), 0xA5);
 
       // Compute reference results, without renormalization.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(accumulators.begin(), accumulators.end(), 0);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oy = 0; oy < output_height(); oy++) {
@@ -776,7 +797,7 @@
           input_pixel_stride(), output_pixel_stride(),
           input_zero_point, 1.0f /* input scale */,
           kernel_zero_point, 1.0f /* kernel scale */,
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_zero_point, output_scale, qmin(), qmax(),
           0, &deconvolution_op));
 
@@ -819,17 +840,21 @@
       std::fill(output.begin(), output.end(), 0xA5);
 
       // Compute reference results for the second run, including renormalization.
-      for (size_t i = 0; i < next_batch_size(); i++) {
-        for (size_t oy = 0; oy < next_output_height(); oy++) {
-          for (size_t ox = 0; ox < next_output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < next_batch_size(); i++) {
+          for (size_t oy = 0; oy < next_output_height(); oy++) {
+            for (size_t ox = 0; ox < next_output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
       }
       for (size_t i = 0; i < next_batch_size(); i++) {
         for (size_t oy = 0; oy < next_output_height(); oy++) {
@@ -920,17 +945,21 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results, without clamping.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oy = 0; oy < output_height(); oy++) {
-          for (size_t ox = 0; ox < output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oy = 0; oy < output_height(); oy++) {
+            for (size_t ox = 0; ox < output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oy = 0; oy < output_height(); oy++) {
@@ -985,7 +1014,7 @@
           dilation_height(), dilation_width(),
           groups(), group_input_channels(), group_output_channels(),
           input_pixel_stride(), output_pixel_stride(),
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_min, output_max,
           0, &deconvolution_op));
 
@@ -1028,17 +1057,21 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results for the second run, including clamping.
-      for (size_t i = 0; i < next_batch_size(); i++) {
-        for (size_t oy = 0; oy < next_output_height(); oy++) {
-          for (size_t ox = 0; ox < next_output_width(); ox++) {
-            for (size_t g = 0; g < groups(); g++) {
-              for (size_t oc = 0; oc < group_output_channels(); oc++) {
-                next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
-                  bias[g * group_output_channels() + oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < next_batch_size(); i++) {
+          for (size_t oy = 0; oy < next_output_height(); oy++) {
+            for (size_t ox = 0; ox < next_output_width(); ox++) {
+              for (size_t g = 0; g < groups(); g++) {
+                for (size_t oc = 0; oc < group_output_channels(); oc++) {
+                  next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
+                    bias[g * group_output_channels() + oc];
+                }
               }
             }
           }
         }
+      } else {
+        std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
       }
       for (size_t i = 0; i < next_batch_size(); i++) {
         for (size_t oy = 0; oy < next_output_height(); oy++) {
@@ -1131,5 +1164,6 @@
   size_t next_batch_size_{0};
   uint8_t qmin_{0};
   uint8_t qmax_{255};
+  bool has_bias_{true};
   size_t iterations_{1};
 };
diff --git a/test/deconvolution.cc b/test/deconvolution.cc
index 470aa85..30ef3ea 100644
--- a/test/deconvolution.cc
+++ b/test/deconvolution.cc
@@ -123,6 +123,18 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, 1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** Future GEMM path, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, grouped_1x1) {
@@ -245,6 +257,19 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, grouped_1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** Future GEMM path, batched ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, batched_1x1) {
@@ -367,6 +392,19 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, batched_1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** Future GEMM path, batched, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, batched_grouped_1x1) {
@@ -498,6 +536,20 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, batched_grouped_1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** CONV path ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, 3x3) {
@@ -772,6 +824,19 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, 3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** CONV path, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, grouped_3x3) {
@@ -1065,6 +1130,20 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, grouped_3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** CONV path, batched ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, batched_3x3) {
@@ -1358,6 +1437,20 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, batched_3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** CONV path, grouped, batched ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, batched_grouped_3x3) {
@@ -1670,6 +1763,21 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, batched_grouped_3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** CONV path, setup ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, 3x3_setup_changing_batch) {
@@ -1980,6 +2088,20 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, 3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** SUBCONV2D path, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, grouped_3x3s2) {
@@ -2260,6 +2382,21 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, grouped_3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .groups(2)
+    .group_input_channels(17)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** SUBCONV2D path, batched ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, batched_3x3s2) {
@@ -2540,6 +2677,21 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, batched_3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** SUBCONV2D path, grouped, batched ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, batched_grouped_3x3s2) {
@@ -2837,6 +2989,22 @@
     .TestQ8();
 }
 
+TEST(DECONVOLUTION_OP_Q8, batched_grouped_3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .groups(2)
+    .group_input_channels(17)
+    .group_output_channels(xnn_params.q8.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestQ8();
+}
+
 /**************************** SUBCONV2D path, setup ****************************/
 
 TEST(DECONVOLUTION_OP_Q8, 3x3s2_setup_changing_batch) {
@@ -3000,6 +3168,18 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, 1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** Future GEMM path, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_F32, grouped_1x1) {
@@ -3122,6 +3302,19 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, grouped_1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** Future GEMM path, batched ****************************/
 
 TEST(DECONVOLUTION_OP_F32, batched_1x1) {
@@ -3244,6 +3437,19 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, batched_1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** Future GEMM path, batched, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_F32, batched_grouped_1x1) {
@@ -3375,6 +3581,20 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, batched_grouped_1x1_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(27, 29)
+    .kernel_size(1, 1)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** CONV path ****************************/
 
 TEST(DECONVOLUTION_OP_F32, 3x3) {
@@ -3649,6 +3869,19 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, 3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** CONV path, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_F32, grouped_3x3) {
@@ -3942,6 +4175,20 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, grouped_3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** CONV path, batched ****************************/
 
 TEST(DECONVOLUTION_OP_F32, batched_3x3) {
@@ -4235,6 +4482,20 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, batched_3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** CONV path, grouped, batched ****************************/
 
 TEST(DECONVOLUTION_OP_F32, batched_grouped_3x3) {
@@ -4547,6 +4808,21 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, batched_grouped_3x3_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(13, 12)
+    .padding(1)
+    .kernel_size(3, 3)
+    .groups(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** CONV path, setup ****************************/
 
 TEST(DECONVOLUTION_OP_F32, 3x3_setup_changing_batch) {
@@ -4857,6 +5133,20 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, 3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** SUBCONV2D path, grouped ****************************/
 
 TEST(DECONVOLUTION_OP_F32, grouped_3x3s2) {
@@ -5137,6 +5427,21 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, grouped_3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .groups(2)
+    .group_input_channels(17)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** SUBCONV2D path, batched ****************************/
 
 TEST(DECONVOLUTION_OP_F32, batched_3x3s2) {
@@ -5417,6 +5722,21 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, batched_3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .group_input_channels(23)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** SUBCONV2D path, grouped, batched ****************************/
 
 TEST(DECONVOLUTION_OP_F32, batched_grouped_3x3s2) {
@@ -5714,6 +6034,22 @@
     .TestF32();
 }
 
+TEST(DECONVOLUTION_OP_F32, batched_grouped_3x3s2_without_bias) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize());
+  DeconvolutionOperatorTester()
+    .has_bias(false)
+    .batch_size(2)
+    .input_size(10, 9)
+    .padding(1)
+    .kernel_size(3, 3)
+    .stride(2)
+    .groups(2)
+    .group_input_channels(17)
+    .group_output_channels(xnn_params.f32.gemm.nr * 2 + 3)
+    .iterations(3)
+    .TestF32();
+}
+
 /**************************** SUBCONV2D path, setup ****************************/
 
 TEST(DECONVOLUTION_OP_F32, 3x3s2_setup_changing_batch) {
diff --git a/test/fully-connected-operator-tester.h b/test/fully-connected-operator-tester.h
index f5b2cd6..5ddfc94 100644
--- a/test/fully-connected-operator-tester.h
+++ b/test/fully-connected-operator-tester.h
@@ -101,6 +101,15 @@
     return this->qmax_;
   }
 
+  inline FullyConnectedOperatorTester& has_bias(bool has_bias) {
+    this->has_bias_ = has_bias;
+    return *this;
+  }
+
+  inline bool has_bias() const {
+    return this->has_bias_;
+  }
+
   inline FullyConnectedOperatorTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
@@ -132,13 +141,16 @@
       std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
       std::generate(bias.begin(), bias.end(), std::ref(s32rng));
       std::fill(output.begin(), output.end(), 0xA5);
-      std::fill(accumulators.begin(), accumulators.end(), 0);
 
       // Compute reference results, without renormalization.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oc = 0; oc < output_channels(); oc++) {
-          accumulators[i * output_channels() + oc] = bias[oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oc = 0; oc < output_channels(); oc++) {
+            accumulators[i * output_channels() + oc] = bias[oc];
+          }
         }
+      } else {
+        std::fill(accumulators.begin(), accumulators.end(), 0);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oc = 0; oc < output_channels(); oc++) {
@@ -175,7 +187,7 @@
           input_stride(), output_stride(),
           input_zero_point, 1.0f /* input scale */,
           kernel_zero_point, 1.0f /* kernel scale */,
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_zero_point, output_scale, qmin(), qmax(),
           0, &fully_connected_op));
 
@@ -228,10 +240,14 @@
       std::fill(output.begin(), output.end(), nanf(""));
 
       // Compute reference results, without renormalization.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t oc = 0; oc < output_channels(); oc++) {
-          output_ref[i * output_channels() + oc] = bias[oc];
+      if (has_bias()) {
+        for (size_t i = 0; i < batch_size(); i++) {
+          for (size_t oc = 0; oc < output_channels(); oc++) {
+            output_ref[i * output_channels() + oc] = bias[oc];
+          }
         }
+      } else {
+        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
       }
       for (size_t i = 0; i < batch_size(); i++) {
         for (size_t oc = 0; oc < output_channels(); oc++) {
@@ -262,7 +278,7 @@
         xnn_create_fully_connected_nc_f32(
           input_channels(), output_channels(),
           input_stride(), output_stride(),
-          kernel.data(), bias.data(),
+          kernel.data(), has_bias() ? bias.data() : nullptr,
           output_min, output_max,
           0, &fully_connected_op));
 
@@ -304,5 +320,6 @@
   size_t batch_size_{1};
   uint8_t qmin_{0};
   uint8_t qmax_{255};
+  bool has_bias_{true};
   size_t iterations_{1};
 };
diff --git a/test/fully-connected.cc b/test/fully-connected.cc
index bff2db9..7f7cb8c 100644
--- a/test/fully-connected.cc
+++ b/test/fully-connected.cc
@@ -60,6 +60,16 @@
     .TestQ8();
 }
 
+TEST(FULLY_CONNECTED_OP_Q8, unit_batch_without_bias) {
+  FullyConnectedOperatorTester()
+    .has_bias(false)
+    .batch_size(1)
+    .input_channels(23)
+    .output_channels(19)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(FULLY_CONNECTED_OP_Q8, small_batch) {
   FullyConnectedOperatorTester()
     .batch_size(12)
@@ -109,6 +119,16 @@
     .TestQ8();
 }
 
+TEST(FULLY_CONNECTED_OP_Q8, small_batch_without_bias) {
+  FullyConnectedOperatorTester()
+    .has_bias(false)
+    .batch_size(12)
+    .input_channels(23)
+    .output_channels(19)
+    .iterations(3)
+    .TestQ8();
+}
+
 TEST(FULLY_CONNECTED_OP_F32, unit_batch) {
   FullyConnectedOperatorTester()
     .batch_size(1)
@@ -158,6 +178,16 @@
     .TestF32();
 }
 
+TEST(FULLY_CONNECTED_OP_F32, unit_batch_without_bias) {
+  FullyConnectedOperatorTester()
+    .has_bias(false)
+    .batch_size(1)
+    .input_channels(23)
+    .output_channels(19)
+    .iterations(3)
+    .TestF32();
+}
+
 TEST(FULLY_CONNECTED_OP_F32, small_batch) {
   FullyConnectedOperatorTester()
     .batch_size(12)
@@ -206,3 +236,13 @@
     .iterations(3)
     .TestF32();
 }
+
+TEST(FULLY_CONNECTED_OP_F32, small_batch_without_bias) {
+  FullyConnectedOperatorTester()
+    .has_bias(false)
+    .batch_size(12)
+    .input_channels(23)
+    .output_channels(19)
+    .iterations(3)
+    .TestF32();
+}