Unify comment style

- Change /* comments */ to // comments
- Re-generate auto-generated files
- Remove legacy unused GEMMINC SSE micro-kernels

PiperOrigin-RevId: 271901989
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
index 5b9776d..ce60230 100644
--- a/src/xnnpack/argmaxpool.h
+++ b/src/xnnpack/argmaxpool.h
@@ -56,5 +56,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h
index 5fd51b9..d838fb5 100644
--- a/src/xnnpack/avgpool.h
+++ b/src/xnnpack/avgpool.h
@@ -92,5 +92,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index db19d28..0cd59b4 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h
@@ -45,5 +45,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/conv.h b/src/xnnpack/conv.h
index c1bdec3..efc2634 100644
--- a/src/xnnpack/conv.h
+++ b/src/xnnpack/conv.h
@@ -59,5 +59,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index dc52a61..22794a6 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -84,5 +84,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index b567196..4680084 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -95,5 +95,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 27f591d..bcc067a 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -185,5 +185,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
index 8d0ab93..eda1a1f 100644
--- a/src/xnnpack/hswish.h
+++ b/src/xnnpack/hswish.h
@@ -31,5 +31,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 4d30c6f..b9dc93d 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -101,5 +101,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/im2col.h b/src/xnnpack/im2col.h
index 07323e3..d57dabb 100644
--- a/src/xnnpack/im2col.h
+++ b/src/xnnpack/im2col.h
@@ -33,5 +33,5 @@
   void* output);
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
index 60be1f6..6853052 100644
--- a/src/xnnpack/indirection.h
+++ b/src/xnnpack/indirection.h
@@ -53,5 +53,5 @@
   uint32_t log2_element_size);
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/lut.h b/src/xnnpack/lut.h
index 49b0ec4..830b7e7 100644
--- a/src/xnnpack/lut.h
+++ b/src/xnnpack/lut.h
@@ -40,5 +40,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
index 1c134d7..1cac764 100644
--- a/src/xnnpack/maxpool.h
+++ b/src/xnnpack/maxpool.h
@@ -52,5 +52,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index a34d6fd..62f956a 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -155,7 +155,7 @@
   size_t slice_height;
   size_t indirection_y_stride;
   size_t indirection_x_stride;
-  /* kernel_size * mr * sizeof(void*) */
+  // scaled_kernel_size := kernel_size * mr * sizeof(void*).
   size_t scaled_kernel_size;
 };
 
diff --git a/src/xnnpack/packx.h b/src/xnnpack/packx.h
index 20b3bc1..a42259d 100644
--- a/src/xnnpack/packx.h
+++ b/src/xnnpack/packx.h
@@ -32,5 +32,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h
index 3cb8103..d3a12ec 100644
--- a/src/xnnpack/pad.h
+++ b/src/xnnpack/pad.h
@@ -35,5 +35,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 30e8393..11065c4 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -34,7 +34,7 @@
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_f32_spchw_params {
@@ -50,7 +50,8 @@
     float min;
     float max;
   } neon;
-#elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
@@ -58,7 +59,7 @@
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_u8_output_params {
@@ -71,13 +72,13 @@
     uint8_t max;
     uint8_t min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_f32_avgpool_params {
@@ -92,14 +93,14 @@
     XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) float output_min[4];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
     XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) float output_min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 };
 
 union xnn_f32_gavgpool_params {
@@ -115,7 +116,7 @@
     XNN_ALIGN(16) float output_min[4];
     XNN_ALIGN(16) uint32_t mask[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
@@ -123,7 +124,7 @@
     XNN_ALIGN(16) float output_min;
     XNN_ALIGN(16) uint32_t mask[4];
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 };
 
 union xnn_f32_hswish_params {
@@ -138,7 +139,7 @@
     XNN_ALIGN(16) float half[4];
     XNN_ALIGN(16) float one[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_q8_gemm_params {
@@ -163,7 +164,7 @@
     uint8_t output_max;
     uint8_t output_min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int16_t kernel_zero_point[8];
@@ -177,7 +178,7 @@
     XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_q8_add_params {
@@ -203,7 +204,7 @@
     uint8_t y_max;
     uint8_t y_min;
   } neon;
-#endif
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t zero_point_product[4];
@@ -220,7 +221,7 @@
     uint32_t a_multiplier;
     uint32_t b_multiplier;
   } sse2;
-#endif
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_q8_avgpool_params {
@@ -242,7 +243,7 @@
     uint8_t output_max;
     uint8_t output_min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t bias[4];
@@ -253,7 +254,7 @@
     XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
-#endif
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_fp32_requantization_params {
@@ -337,7 +338,7 @@
     uint8_t max;
     uint8_t min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t multiplier[4];
@@ -349,7 +350,7 @@
     XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_requantization_params {
@@ -1125,7 +1126,7 @@
 struct gemm_parameters {
   xnn_gemm_ukernel_function gemm;
   xnn_igemm_ukernel_function igemm;
-  /* Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters */
+  // Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters.
   xnn_gemm_ukernel_function gemm1;
   xnn_igemm_ukernel_function igemm1;
   uint8_t mr;
diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h
index f124519..3be16d3 100644
--- a/src/xnnpack/pavgpool.h
+++ b/src/xnnpack/pavgpool.h
@@ -56,5 +56,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/ppmm.h b/src/xnnpack/ppmm.h
index 1bf6941..3239736 100644
--- a/src/xnnpack/ppmm.h
+++ b/src/xnnpack/ppmm.h
@@ -41,5 +41,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h
index 2a882a7..559696a 100644
--- a/src/xnnpack/prelu.h
+++ b/src/xnnpack/prelu.h
@@ -34,5 +34,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
index ee6e86d..cfeaed6 100644
--- a/src/xnnpack/requantization-stubs.h
+++ b/src/xnnpack/requantization-stubs.h
@@ -65,5 +65,5 @@
 DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__neon)
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index bf3e100..51cff74 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -34,15 +34,15 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -73,15 +73,15 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -150,17 +150,17 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   assert(scale >= 0x1.0p-32f);
   assert(scale < 256.0f);
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
   assert(multiplier >= INT32_C(0x00800000));
   assert(multiplier <= INT32_C(0x00FFFFFF));
 
-  /* Shift is in [16, 55] range */
+  // Shift is in [16, 55] range.
   const int32_t shift = 127 + 23 - (scale_bits >> 23);
   assert(shift >= 16);
   assert(shift < 64);
@@ -218,17 +218,17 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   assert(scale >= 0x1.0p-32f);
   assert(scale < 256.0f);
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
   assert(multiplier >= INT32_C(0x00800000));
   assert(multiplier <= INT32_C(0x00FFFFFF));
 
-  /* Shift is in [16, 55] range */
+  // Shift is in [16, 55] range.
   const int32_t shift = 127 + 23 - (scale_bits >> 23);
   assert(shift >= 16);
   assert(shift < 64);
@@ -1023,20 +1023,20 @@
   assert(a_output_scale < 0x1.0p+8f);
   assert(b_output_scale < 0x1.0p+8f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
   assert(max_output_scale >= 0x1.0p-14f);
   assert(max_output_scale < 0x1.0p+8f);
   const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
   const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
-  /* Shift is in [13, 31] range */
+  // Shift is in [13, 31] range.
   const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
   assert(shift < 32);
   assert(shift >= 13);
 
   const float scale_multiplier = fp32_from_bits((uint32_t) (21 - max_scale_exponent + 127) << 23);
 
-  /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+  // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
   const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(a_output_scale * scale_multiplier);
   const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(b_output_scale * scale_multiplier);
   assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
@@ -1112,18 +1112,18 @@
   assert(a_output_scale < 0x1.0p+8f);
   assert(b_output_scale < 0x1.0p+8f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
   assert(max_output_scale >= 0x1.0p-10f);
   assert(max_output_scale < 0x1.0p+8f);
   const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
   const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
-  /* Shift is in [13, 31] range */
+  // Shift is in [13, 31] range.
   const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
   assert(shift < 32);
   assert(shift >= 13);
 
-  /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+  // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
   const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(a_output_scale) + (shift << 23)));
   const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(b_output_scale) + (shift << 23)));
   assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
@@ -1152,17 +1152,17 @@
   uint8_t min,
   uint8_t max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -1186,15 +1186,15 @@
   uint8_t min,
   uint8_t max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -1286,16 +1286,16 @@
   uint8_t a, uint8_t b,
   union xnn_q8_add_params params)
 {
-  /* Multiply by factors and accumulate products */
+  // Multiply by factors and accumulate products.
   int32_t acc = params.scalar.zero_point_product +
     (int32_t) ((uint32_t) a * params.scalar.a_multiplier) +
     (int32_t) ((uint32_t) b * params.scalar.b_multiplier);
 
-  /* Shift right and round */
+  // Shift right and round.
   const int32_t rem = (acc & params.scalar.remainder_mask) - (int32_t) (acc < 0);
   acc = asr_s32(acc, params.scalar.shift) + (int32_t) (rem > params.scalar.remainder_threshold);
 
-  /* Clamp and add output zero point */
+  // Clamp and add output zero point.
   int32_t y = acc + params.scalar.y_zero_point;
   if (y >= params.scalar.y_max) {
     y = params.scalar.y_max;
diff --git a/src/xnnpack/rmax.h b/src/xnnpack/rmax.h
index 25f6e32..0dc1996 100644
--- a/src/xnnpack/rmax.h
+++ b/src/xnnpack/rmax.h
@@ -43,5 +43,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/scalar-utils.h b/src/xnnpack/scalar-utils.h
index 88d30c8..caa607d 100644
--- a/src/xnnpack/scalar-utils.h
+++ b/src/xnnpack/scalar-utils.h
@@ -32,7 +32,7 @@
   #if __GNUC__ >= 8
     #define XNN_IGNORE_SHIFT_BASE_UB __attribute__((__no_sanitize__("shift-base")))
   #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4
-    /* 4.9 <= gcc < 8 support ubsan, but doesn't support no_sanitize attribute */
+    // 4.9 <= gcc < 8 supports ubsan, but doesn't support the no_sanitize attribute.
     #define XNN_IGNORE_SHIFT_BASE_UB
     #ifndef XNN_USE_SHIFT_BASE_UB_WORKAROUND
       #define XNN_USE_SHIFT_BASE_UB_WORKAROUND 1
@@ -82,28 +82,22 @@
   assert(shift >= 24);
   assert(shift < 56);
 
-  /*
-   * Compute absolute value of input as unsigned 32-bit int.
-   * All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
-   */
+  // Compute absolute value of input as unsigned 32-bit int.
+  // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
   const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
 
-  /* Compute full 64-bit product of 32-bit factors */
+  // Compute full 64-bit product of 32-bit factors.
   const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
 
-  /*
-   * Shift the full 64-bit product right with rounding.
-   * Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
-   */
+  // Shift the full 64-bit product right with rounding.
+  // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
   const uint64_t rounding = UINT64_C(1) << (shift - 1);
   const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
 
-  /*
-   * Copy the sign of input to scaled absolute input value.
-   */
+  // Copy the sign of input to scaled absolute input value.
   const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
 
-  /* Clamp scaled value with zero point between smin and smax */
+  // Clamp scaled value with zero point between smin and smax.
   int32_t clamped_value = scaled_value;
   const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
   if (clamped_value < smin) {
@@ -114,7 +108,7 @@
     clamped_value = smax;
   }
 
-  /* Add zero point to clamped value */
+  // Add zero point to clamped value.
   const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
 
   return biased_value;
diff --git a/src/xnnpack/spmm.h b/src/xnnpack/spmm.h
index 7ea16bf..93f8ae6 100644
--- a/src/xnnpack/spmm.h
+++ b/src/xnnpack/spmm.h
@@ -62,5 +62,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/unpool.h b/src/xnnpack/unpool.h
index c02457a..ef90f31 100644
--- a/src/xnnpack/unpool.h
+++ b/src/xnnpack/unpool.h
@@ -30,5 +30,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vadd.h b/src/xnnpack/vadd.h
index a66d171..8f34a0b 100644
--- a/src/xnnpack/vadd.h
+++ b/src/xnnpack/vadd.h
@@ -47,5 +47,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vmul.h b/src/xnnpack/vmul.h
index 9747de8..2ea19f7 100644
--- a/src/xnnpack/vmul.h
+++ b/src/xnnpack/vmul.h
@@ -31,5 +31,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
index a37e747..cd28a34 100644
--- a/src/xnnpack/vmulcaddc.h
+++ b/src/xnnpack/vmulcaddc.h
@@ -35,5 +35,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vsub.h b/src/xnnpack/vsub.h
index e444eb6..66e0152 100644
--- a/src/xnnpack/vsub.h
+++ b/src/xnnpack/vsub.h
@@ -31,5 +31,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/zip.h b/src/xnnpack/zip.h
index 48b164e..4657271 100644
--- a/src/xnnpack/zip.h
+++ b/src/xnnpack/zip.h
@@ -82,5 +82,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif