QS8 version of NHWC Convolution operator

- Operator implementation
- Unit tests
- Microbenchmarks

PiperOrigin-RevId: 324916339
diff --git a/src/packing.c b/src/packing.c
index 5736624..24ade5b 100644
--- a/src/packing.c
+++ b/src/packing.c
@@ -777,6 +777,53 @@
   }
 }
 
+void xnn_pack_qs8_conv_kgo_w(  // Packs int8 kernel values and int32 biases into packed_w, one nr-wide tile at a time.
+  size_t g,  // outer loop count; k (and b, if non-NULL) advance by nc after each iteration
+  size_t nc,  // channels handled per outer iteration; split into tiles of nr
+  size_t ks,  // kernel elements read per channel (ki runs over [0, ks))
+  size_t nr,  // tile width in channels; a partial last tile still occupies nr slots
+  size_t kr,  // byte distance between consecutive packed kernel values
+  const int8_t* k,  // source kernel, read as k[ki * g * nc + channel]
+  const int32_t* b,  // source biases, or NULL to pack zeros instead
+  void* packed_w,  // destination buffer, written sequentially from its start
+  const struct xnn_qs8_packing_params* params)  // only input_zero_point is read
+{
+  const int32_t izp = (int32_t) params->input_zero_point;  // widened once for the bias correction below
+  for (size_t i = 0; i < g; i++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // the last tile may hold fewer than nr channels
+      int32_t* packed_b = (int32_t*) packed_w;  // remember where this tile's biases start; they are adjusted later
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];  // copy the caller's bias for this channel
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;  // n >= 1 whenever nr != 0, because nr_block_start < nc
+        do {
+          *((int32_t*) packed_w) = 0;  // no bias supplied: start each accumulator at zero
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile; they are not written here
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          const int8_t kv =  // kernel value for element ki of the current channel
+            k[ki * g * nc + (nr_block_start + nr_block_offset)];
+          *((int8_t*) packed_w) = kv;  // only the first byte of each kr-byte slot is written
+          packed_b[nr_block_offset] -= (int32_t) kv * izp;  // fold kv * input_zero_point into the packed bias
+          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));  // NOTE(review): remaining kr - 1 bytes are left untouched; presumably the caller pre-zeroes packed_w, confirm
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));  // skip the slots of channels missing from a partial tile
+      }
+    }
+    k += nc;  // move to the next outer iteration's nc columns of the kernel
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // biases are optional, so only advance a real pointer
+    }
+  }
+}
+
 void xnn_pack_f32_deconv_goki_w(
   size_t g,
   size_t nc,
@@ -1253,6 +1300,47 @@
   }
 }
 
+void xnn_pack_qs8_dwconv_hwg_w(  // Packs int8 kernel values and int32 biases into packed_w, one cr-wide channel tile at a time.
+  size_t h,  // kernel extent iterated by y (inner of the two spatial loops)
+  size_t w,  // kernel extent iterated by x (outer of the two spatial loops)
+  size_t c,  // total number of channels; split into tiles of cr
+  size_t cr,  // tile width in channels; a partial last tile still occupies cr slots
+  const int8_t* k,  // source kernel, read as k[(y * w + x) * c + channel]
+  const int32_t* b,  // source biases, or NULL to pack zeros instead
+  void* packed_w,  // destination buffer, written sequentially from its start
+  const struct xnn_qs8_packing_params* params)  // only input_zero_point is read
+{
+  const int32_t izp = (int32_t) params->input_zero_point;  // widened once for the bias correction below
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);  // the last tile may hold fewer than cr channels
+    int32_t* packed_b = (int32_t*) packed_w;  // remember where this tile's biases start; they are adjusted later
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];  // copy the caller's bias for this channel
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;  // n >= 1 whenever cr != 0, because cr_block_start < c
+      do {
+        *((int32_t*) packed_w) = 0;  // no bias supplied: start each accumulator at zero
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile; they are not written here
+    for (size_t x = 0; x < w; x++) {  // x is the outer loop, so packed taps are ordered x-major although k is indexed y * w + x
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];  // kernel value at tap (y, x) for this channel
+          packed_b[cr_block_offset] -= (int32_t) kv * izp;  // fold kv * input_zero_point into the packed bias
+          *((int8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));  // packed kernel values are contiguous, one byte each
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));  // skip the slots of channels missing from a partial tile; not written here
+      }
+    }
+  }
+}
+
 void xnn_pack_f32_gemminc_goi_w(
   size_t g,
   size_t nc,