Unify comment style

- Change /* comments */ to // comments
- Re-generate auto-generated files
- Remove legacy unused GEMMINC SSE micro-kernels

PiperOrigin-RevId: 271901989
diff --git a/src/f32-argmaxpool/mp9p8q-psimd.c b/src/f32-argmaxpool/mp9p8q-psimd.c
index cb8a4f5..2e2564a8 100644
--- a/src/f32-argmaxpool/mp9p8q-psimd.c
+++ b/src/f32-argmaxpool/mp9p8q-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/mp9p8q-scalar.c b/src/f32-argmaxpool/mp9p8q-scalar.c
index f8ae537..0f9f832 100644
--- a/src/f32-argmaxpool/mp9p8q-scalar.c
+++ b/src/f32-argmaxpool/mp9p8q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/mp9p8q-sse2.c b/src/f32-argmaxpool/mp9p8q-sse2.c
index 7eddcd7..31b55bf 100644
--- a/src/f32-argmaxpool/mp9p8q-sse2.c
+++ b/src/f32-argmaxpool/mp9p8q-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/up4-psimd.c b/src/f32-argmaxpool/up4-psimd.c
index 5b55bfa..fe0f1fc 100644
--- a/src/f32-argmaxpool/up4-psimd.c
+++ b/src/f32-argmaxpool/up4-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/up4-scalar.c b/src/f32-argmaxpool/up4-scalar.c
index 1d95c8f..8b668b0 100644
--- a/src/f32-argmaxpool/up4-scalar.c
+++ b/src/f32-argmaxpool/up4-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/up4-sse2.c b/src/f32-argmaxpool/up4-sse2.c
index 64d1d12..1f0e3cf 100644
--- a/src/f32-argmaxpool/up4-sse2.c
+++ b/src/f32-argmaxpool/up4-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/up9-psimd.c b/src/f32-argmaxpool/up9-psimd.c
index 69ec655..73d7931 100644
--- a/src/f32-argmaxpool/up9-psimd.c
+++ b/src/f32-argmaxpool/up9-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/up9-scalar.c b/src/f32-argmaxpool/up9-scalar.c
index 0cb8a49..08b6fa0 100644
--- a/src/f32-argmaxpool/up9-scalar.c
+++ b/src/f32-argmaxpool/up9-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-argmaxpool/up9-sse2.c b/src/f32-argmaxpool/up9-sse2.c
index f2df769..acd8609 100644
--- a/src/f32-argmaxpool/up9-sse2.c
+++ b/src/f32-argmaxpool/up9-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/mp9p8q-neon.c b/src/f32-avgpool/mp9p8q-neon.c
index 78d9456..1b59bca 100644
--- a/src/f32-avgpool/mp9p8q-neon.c
+++ b/src/f32-avgpool/mp9p8q-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/mp9p8q-psimd.c b/src/f32-avgpool/mp9p8q-psimd.c
index 96e4c54..db808e6 100644
--- a/src/f32-avgpool/mp9p8q-psimd.c
+++ b/src/f32-avgpool/mp9p8q-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/mp9p8q-scalar.c b/src/f32-avgpool/mp9p8q-scalar.c
index c1c5af8..7d91503 100644
--- a/src/f32-avgpool/mp9p8q-scalar.c
+++ b/src/f32-avgpool/mp9p8q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/mp9p8q-sse.c b/src/f32-avgpool/mp9p8q-sse.c
index 7efa78d..a820eb4 100644
--- a/src/f32-avgpool/mp9p8q-sse.c
+++ b/src/f32-avgpool/mp9p8q-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/up9-neon.c b/src/f32-avgpool/up9-neon.c
index 5ae8f80..0c9d150 100644
--- a/src/f32-avgpool/up9-neon.c
+++ b/src/f32-avgpool/up9-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/up9-psimd.c b/src/f32-avgpool/up9-psimd.c
index 0579158..834cf9b 100644
--- a/src/f32-avgpool/up9-psimd.c
+++ b/src/f32-avgpool/up9-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/up9-scalar.c b/src/f32-avgpool/up9-scalar.c
index 0b4b253..9174ad5 100644
--- a/src/f32-avgpool/up9-scalar.c
+++ b/src/f32-avgpool/up9-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-avgpool/up9-sse.c b/src/f32-avgpool/up9-sse.c
index de3685a..f6c7d96 100644
--- a/src/f32-avgpool/up9-sse.c
+++ b/src/f32-avgpool/up9-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-clamp/neon.c b/src/f32-clamp/neon.c
index e131e3e..d87435e 100644
--- a/src/f32-clamp/neon.c
+++ b/src/f32-clamp/neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-clamp/psimd.c b/src/f32-clamp/psimd.c
index 53c253c..41e841d 100644
--- a/src/f32-clamp/psimd.c
+++ b/src/f32-clamp/psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-clamp/scalar.c b/src/f32-clamp/scalar.c
index 4fd6ae2..3973b88 100644
--- a/src/f32-clamp/scalar.c
+++ b/src/f32-clamp/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-clamp/sse.c b/src/f32-clamp/sse.c
index 21e2976..e399757 100644
--- a/src/f32-clamp/sse.c
+++ b/src/f32-clamp/sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-dwconv-spchw/3x3p1-neonfma.c b/src/f32-dwconv-spchw/3x3p1-neonfma.c
index 3f2c0e7..ca36ee0 100644
--- a/src/f32-dwconv-spchw/3x3p1-neonfma.c
+++ b/src/f32-dwconv-spchw/3x3p1-neonfma.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -36,7 +34,7 @@
   const size_t input_width_increment_single = input_width_stride - round_up_po2(n, 4) / 4 * input_tuple_stride;
   const size_t output_width_increment_single = output_width_stride - (n - 1) / 4 * output_tuple_stride;
 
-  /* No vertical padding */
+  // No vertical padding.
   const float* i0 = input;
   const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
   const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
@@ -150,7 +148,7 @@
       vst1q_f32(output1, vo1); output1 = (float*) ((uintptr_t) output1 + output_tuple_stride);
       vst1q_f32(output2, vo2); output2 = (float*) ((uintptr_t) output2 + output_tuple_stride);
     }
-    /* Always process the last block of 1..4 pixels */
+    // Always process the last block of 1..4 pixels.
     assert(k >= 1);
     assert(k <= 4);
     {
@@ -314,7 +312,7 @@
 
       vst1q_f32(output0, vo); output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
     }
-    /* Always process the last block of 1..4 pixels */
+    // Always process the last block of 1..4 pixels.
     assert(k >= 1);
     assert(k <= 4);
     {
diff --git a/src/f32-dwconv-spchw/3x3p1-sse.c b/src/f32-dwconv-spchw/3x3p1-sse.c
index 6507fce..eba2dc5 100644
--- a/src/f32-dwconv-spchw/3x3p1-sse.c
+++ b/src/f32-dwconv-spchw/3x3p1-sse.c
@@ -32,7 +32,7 @@
   const size_t input_width_increment = input_width_stride - round_up_po2(n, 4) / 4 * input_tuple_stride;
   const size_t output_width_increment = output_width_stride - (n - 1) / 4 * output_tuple_stride;
 
-  /* No vertical padding */
+  // No vertical padding.
   const float* i0 = input;
   const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
   const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
@@ -49,19 +49,19 @@
   const __m128 vk22 = _mm_load1_ps(weights + 9);
 
   do {
-    /* vi0x3012 = ( vi02, vi01, vi00, vi03 ) */
+    // vi0x3012 = ( vi02, vi01, vi00, vi03 )
     __m128 vi0x3012 = _mm_setzero_ps();
-    /* vi1x3012 = ( vi12, vi11, vi10, vi13 ) */
+    // vi1x3012 = ( vi12, vi11, vi10, vi13 )
     __m128 vi1x3012 = _mm_setzero_ps();
-    /* vi2x3012 = ( vi22, vi21, vi20, vi13 ) */
+    // vi2x3012 = ( vi22, vi21, vi20, vi13 )
     __m128 vi2x3012 = _mm_setzero_ps();
-    /* vi0x4567 = ( vi07, vi06, vi05, vi04 ) */
+    // vi0x4567 = ( vi07, vi06, vi05, vi04 )
     __m128 vi0x4567 = _mm_loadu_ps(i0);
     i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
-    /* vi1x4567 = ( vi17, vi16, vi15, vi14 ) */
+    // vi1x4567 = ( vi17, vi16, vi15, vi14 )
     __m128 vi1x4567 = _mm_loadu_ps(i1);
     i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
-    /* vi2x4567 = ( vi27, vi26, vi25, vi24 ) */
+    // vi2x4567 = ( vi27, vi26, vi25, vi24 )
     __m128 vi2x4567 = _mm_loadu_ps(i2);
     i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
 
@@ -69,32 +69,32 @@
     for (; k > 4; k -= 4) {
       __m128 vo4567p0 = vbias;
 
-      /* vi0x89AB = ( vi0B, vi0A, vi09, vi08 ) */
+      // vi0x89AB = ( vi0B, vi0A, vi09, vi08 )
       const __m128 vi0x89AB = _mm_loadu_ps(i0);
       i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
-      /* vi1x89AB = ( vi1B, vi0A, vi09, vi08 ) */
+      // vi1x89AB = ( vi1B, vi0A, vi09, vi08 )
       const __m128 vi1x89AB = _mm_loadu_ps(i1);
       i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
-      /* vi2x89AB = ( vi2B, vi0A, vi09, vi08 ) */
+      // vi2x89AB = ( vi2B, vi0A, vi09, vi08 )
       const __m128 vi2x89AB = _mm_loadu_ps(i2);
       i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
 
-      /* vi0x7456 = ( vi06, vi05, vi04, vi07 ) */
+      // vi0x7456 = ( vi06, vi05, vi04, vi07 )
       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
-      /* vi1x7456 = ( vi16, vi15, vi14, vi17 ) */
+      // vi1x7456 = ( vi16, vi15, vi14, vi17 )
       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
-      /* vi2x7456 = ( vi26, vi25, vi24, vi27 ) */
+      // vi2x7456 = ( vi26, vi25, vi24, vi27 )
       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
 
       vo4567p0 = _mm_add_ps(vo4567p0, _mm_mul_ps(vi0x4567, vk01));
       __m128 vo4567p1 = _mm_mul_ps(vi1x4567, vk11);
       __m128 vo4567p2 = _mm_mul_ps(vi2x4567, vk21);
 
-      /* vi0x3456 = ( vi06, vi05, vi04, vi03 ) */
+      // vi0x3456 = ( vi06, vi05, vi04, vi03 )
       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
-      /* vi1x3456 = ( vi16, vi15, vi14, vi13 ) */
+      // vi1x3456 = ( vi16, vi15, vi14, vi13 )
       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
-      /* vi2x3456 = ( vi26, vi25, vi24, vi23 ) */
+      // vi2x3456 = ( vi26, vi25, vi24, vi23 )
       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
 
       vo4567p0 = _mm_add_ps(vo4567p0, _mm_mul_ps(vi0x3456, vk00));
@@ -105,18 +105,18 @@
       vi1x3012 = vi1x7456;
       vi2x3012 = vi2x7456;
 
-      /* vi0x8567 = ( vi07, vi06, vi05, vi08 ) */
+      // vi0x8567 = ( vi07, vi06, vi05, vi08 )
       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
-      /* vi1x8567 = ( vi17, vi16, vi15, vi18 ) */
+      // vi1x8567 = ( vi17, vi16, vi15, vi18 )
       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
-      /* vi2x8567 = ( vi27, vi26, vi25, vi28 ) */
+      // vi2x8567 = ( vi27, vi26, vi25, vi28 )
       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
 
-      /* vi0x5678 = ( vi08, vi07, vi06, vi05 ) */
+      // vi0x5678 = ( vi08, vi07, vi06, vi05 )
       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
-      /* vi1x5678 = ( vi18, vi17, vi16, vi15 ) */
+      // vi1x5678 = ( vi18, vi17, vi16, vi15 )
       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
-      /* vi2x5678 = ( vi28, vi27, vi26, vi25 ) */
+      // vi2x5678 = ( vi28, vi27, vi26, vi25 )
       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
 
       vo4567p0 = _mm_add_ps(vo4567p0, _mm_mul_ps(vi0x5678, vk02));
@@ -136,7 +136,7 @@
       _mm_storeu_ps(output, vo);
       output = (float*) ((uintptr_t) output + output_tuple_stride);
     }
-    /* Always process the last block of 1..4 pixels */
+    // Always process the last block of 1..4 pixels.
     assert(k >= 1);
     assert(k <= 4);
     {
@@ -146,22 +146,22 @@
       vi1x4567 = _mm_and_ps(vmask, vi1x4567);
       vi2x4567 = _mm_and_ps(vmask, vi2x4567);
 
-      /* vi0x7456 = ( vi06, vi05, vi04, vi07 ) */
+      // vi0x7456 = ( vi06, vi05, vi04, vi07 )
       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
-      /* vi1x7456 = ( vi16, vi15, vi14, vi17 ) */
+      // vi1x7456 = ( vi16, vi15, vi14, vi17 )
       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
-      /* vi2x7456 = ( vi26, vi25, vi24, vi27 ) */
+      // vi2x7456 = ( vi26, vi25, vi24, vi27 )
       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
 
       vo4567p0 = _mm_add_ps(vo4567p0, _mm_mul_ps(vi0x4567, vk01));
       __m128 vo4567p1 = _mm_mul_ps(vi1x4567, vk11);
       __m128 vo4567p2 = _mm_mul_ps(vi2x4567, vk21);
 
-      /* vi0x3456 = ( vi06, vi05, vi04, vi03 ) */
+      // vi0x3456 = ( vi06, vi05, vi04, vi03 )
       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
-      /* vi1x3456 = ( vi16, vi15, vi14, vi13 ) */
+      // vi1x3456 = ( vi16, vi15, vi14, vi13 )
       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
-      /* vi2x3456 = ( vi26, vi25, vi24, vi23 ) */
+      // vi2x3456 = ( vi26, vi25, vi24, vi23 )
       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
 
       vo4567p0 = _mm_add_ps(vo4567p0, _mm_mul_ps(vi0x3456, vk00));
@@ -169,18 +169,18 @@
       vo4567p2 = _mm_add_ps(vo4567p2, _mm_mul_ps(vi2x3456, vk20));
 
       const __m128 vzero = _mm_setzero_ps();
-      /* vi0x8567 = ( vi07, vi06, vi05, 0.0 ) */
+      // vi0x8567 = ( vi07, vi06, vi05, 0.0 )
       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
-      /* vi1x8567 = ( vi17, vi16, vi15, 0.0 ) */
+      // vi1x8567 = ( vi17, vi16, vi15, 0.0 )
       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
-      /* vi2x8567 = ( vi27, vi26, vi25, 0.0 ) */
+      // vi2x8567 = ( vi27, vi26, vi25, 0.0 )
       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
 
-      /* vi0x5678 = ( vi08, vi07, vi06, vi05 ) */
+      // vi0x5678 = ( vi08, vi07, vi06, vi05 )
       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
-      /* vi1x5678 = ( vi18, vi17, vi16, vi15 ) */
+      // vi1x5678 = ( vi18, vi17, vi16, vi15 )
       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
-      /* vi2x5678 = ( vi28, vi27, vi26, vi25 ) */
+      // vi2x5678 = ( vi28, vi27, vi26, vi25 )
       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
 
       vo4567p0 = _mm_add_ps(vo4567p0, _mm_mul_ps(vi0x5678, vk02));
diff --git a/src/f32-dwconv-spchw/3x3s2p1-neonfma.c b/src/f32-dwconv-spchw/3x3s2p1-neonfma.c
index 008b5fb..9386d7c 100644
--- a/src/f32-dwconv-spchw/3x3s2p1-neonfma.c
+++ b/src/f32-dwconv-spchw/3x3s2p1-neonfma.c
@@ -33,7 +33,7 @@
   const size_t input_width_increment = input_width_stride * 2 - n / 8 * input_tuple_stride * 2;
   const size_t output_width_increment = output_width_stride - n / 8 * output_tuple_stride;
 
-  /* No vertical padding */
+  // No vertical padding.
   const float* i0 = input;
   const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
   const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
@@ -49,7 +49,6 @@
 
     size_t k = n;
     for (; k >= 8; k -= 8) {
-      // bias
       float32x4_t vo468Ap0 = vdupq_laneq_f32(vw0123, 0);
 
       const float32x4_t vi0x4567 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
@@ -86,12 +85,12 @@
       vo468Ap1 = vfmaq_laneq_f32(vo468Ap1, vi1x3579, vw4567, 0);
       vo468Ap2 = vfmaq_laneq_f32(vo468Ap2, vi2x3579, vw4567, 3);
 
-      // do multiplication by right filter tap
+      // Do multiplication by right filter tap.
       vo468Ap0 = vfmaq_laneq_f32(vo468Ap0, vi0x579B, vw0123, 3);
       vo468Ap1 = vfmaq_laneq_f32(vo468Ap1, vi1x579B, vw4567, 2);
       vo468Ap2 = vfmaq_lane_f32 (vo468Ap2, vi2x579B, vw89, 1);
 
-      // add up across rows to get the final outputs
+      // Add up across rows to get the final outputs.
       float32x4_t vo = vaddq_f32(vo468Ap0, vo468Ap1);
       vo = vaddq_f32(vo, vo468Ap2);
 
@@ -100,10 +99,9 @@
 
       vst1q_f32(output, vo); output = (float*) ((uintptr_t) output + output_tuple_stride);
     }
-    /* Last block has 0-7 pixels to process */
+    // Last block has 0-7 pixels to process.
     assert(k < 8);
     if XNN_LIKELY(k != 0) {
-      // bias
       float32x4_t vo468Ap0 = vdupq_laneq_f32(vw0123, 0);
 
       const float32x4_t vi0x4567 = vld1q_f32(i0);
diff --git a/src/f32-dwconv-spchw/3x3s2p1-sse.c b/src/f32-dwconv-spchw/3x3s2p1-sse.c
index 204dc52..b8e7480 100644
--- a/src/f32-dwconv-spchw/3x3s2p1-sse.c
+++ b/src/f32-dwconv-spchw/3x3s2p1-sse.c
@@ -33,7 +33,7 @@
   const size_t input_width_increment = input_width_stride * 2 - n / 8 * input_tuple_stride * 2;
   const size_t output_width_increment = output_width_stride - n / 8 * output_tuple_stride;
 
-  /* No vertical padding */
+  // No vertical padding.
   const float* i0 = input;
   const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
   const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
@@ -112,7 +112,7 @@
       _mm_storeu_ps(output, vo);
       output = (float*) ((uintptr_t) output + output_tuple_stride);
     }
-    /* Last block has 0-7 pixels to process */
+    // Last block has 0-7 pixels to process.
     assert(k < 8);
     if XNN_LIKELY(k != 0) {
       __m128 vo8ACEp0 = vbias;
diff --git a/src/f32-dwconv-spchw/5x5p2-neonfma.c b/src/f32-dwconv-spchw/5x5p2-neonfma.c
index ed60827..df063df 100644
--- a/src/f32-dwconv-spchw/5x5p2-neonfma.c
+++ b/src/f32-dwconv-spchw/5x5p2-neonfma.c
@@ -32,7 +32,7 @@
   const size_t input_width_increment_single = input_width_stride - round_up_po2(n, 4) / 4 * input_tuple_stride;
   const size_t output_width_increment_single = output_width_stride - (n - 1) / 4 * output_tuple_stride;
 
-  /* No vertical padding */
+  // No vertical padding.
   const float* i0 = input;
   const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
   const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
@@ -146,7 +146,7 @@
 
       vst1q_f32(output0, vo0); output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
     }
-    /* Always process the last block of 5..8 pixels */
+    // Always process the last block of 5..8 pixels.
     if XNN_LIKELY(k > 4)
     {
       float32x4_t vo4567p00 = vdupq_laneq_f32(vw0123, 0);
diff --git a/src/f32-dwconv-spchw/5x5s2p2-neonfma.c b/src/f32-dwconv-spchw/5x5s2p2-neonfma.c
index 18fb0a2..73d8abf 100644
--- a/src/f32-dwconv-spchw/5x5s2p2-neonfma.c
+++ b/src/f32-dwconv-spchw/5x5s2p2-neonfma.c
@@ -33,7 +33,7 @@
   const size_t input_width_increment_single = input_width_stride * 2 - input_tuple_stride * ( (n - 1) / 4 + 1);
   const size_t output_width_increment_single = output_width_stride - (n + 1) / 8 * output_tuple_stride;
 
-  /* No vertical padding */
+  // No vertical padding.
   const float* i0 = input;
   const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
   const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
diff --git a/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S b/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
index 01cd48d..8f3b88d 100644
--- a/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 #include <xnnpack/assembly.h>
 
diff --git a/src/f32-gavgpool/mp7p7q-neon.c b/src/f32-gavgpool/mp7p7q-neon.c
index ec7224f..e02113e 100644
--- a/src/f32-gavgpool/mp7p7q-neon.c
+++ b/src/f32-gavgpool/mp7p7q-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/mp7p7q-psimd.c b/src/f32-gavgpool/mp7p7q-psimd.c
index be1a8dc..c2ae501 100644
--- a/src/f32-gavgpool/mp7p7q-psimd.c
+++ b/src/f32-gavgpool/mp7p7q-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/mp7p7q-scalar.c b/src/f32-gavgpool/mp7p7q-scalar.c
index ac721b8..31dec7a 100644
--- a/src/f32-gavgpool/mp7p7q-scalar.c
+++ b/src/f32-gavgpool/mp7p7q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/mp7p7q-sse.c b/src/f32-gavgpool/mp7p7q-sse.c
index a7a8891..a1aacfd 100644
--- a/src/f32-gavgpool/mp7p7q-sse.c
+++ b/src/f32-gavgpool/mp7p7q-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/up7-neon.c b/src/f32-gavgpool/up7-neon.c
index e102996..8be6162 100644
--- a/src/f32-gavgpool/up7-neon.c
+++ b/src/f32-gavgpool/up7-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/up7-psimd.c b/src/f32-gavgpool/up7-psimd.c
index 3c69d53..2fb196b 100644
--- a/src/f32-gavgpool/up7-psimd.c
+++ b/src/f32-gavgpool/up7-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/up7-scalar.c b/src/f32-gavgpool/up7-scalar.c
index 34788d6..a4a1a5b 100644
--- a/src/f32-gavgpool/up7-scalar.c
+++ b/src/f32-gavgpool/up7-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gavgpool/up7-sse.c b/src/f32-gavgpool/up7-sse.c
index ec23f2e..f293324 100644
--- a/src/f32-gavgpool/up7-sse.c
+++ b/src/f32-gavgpool/up7-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
index 3bca67f..4a342bb 100644
--- a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
@@ -2,10 +2,10 @@
 //   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
 //   Generator: tools/xngen
 //
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 #include <xnnpack/assembly.h>
 
diff --git a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
index c79b02e..0691ee9 100644
--- a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
@@ -1,7 +1,7 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 #include <xnnpack/assembly.h>
 
diff --git a/src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
index 873ead4..86898b0 100644
--- a/src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
@@ -2,10 +2,10 @@
 //   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
 //   Generator: tools/xngen
 //
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 #include <xnnpack/assembly.h>
 
diff --git a/src/f32-gemminc/1x8-sse.c b/src/f32-gemminc/1x8-sse.c
deleted file mode 100644
index 0bde2f9..0000000
--- a/src/f32-gemminc/1x8-sse.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Auto-generated file. Do not edit!
- *   Template: src/f32-gemm/sse.c.in
- *   Generator: tools/xngen
- */
-
-
-#include <assert.h>
-
-#include <xmmintrin.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_1x8__sse(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float*restrict a,
-    size_t a_stride,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float* restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-
-  const float* a0 = a;
-  float* c0 = c;
-
-  do {
-    __m128 vacc0x0123 = _mm_load_ps(acc + 0);
-    __m128 vacc0x4567 = _mm_load_ps(acc + 4);
-    acc += 8;
-
-    size_t k = kc;
-    do {
-      const __m128 va0 = _mm_load1_ps(a0);
-      a0 += 1;
-
-      const __m128 vb0123 = _mm_load_ps(w);
-      const __m128 vb4567 = _mm_load_ps(w + 4);
-      w += 8;
-
-      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
-      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
-
-      k -= sizeof(float);
-    } while (k != 0);
-
-    const __m128 vmax = _mm_load_ps(params->sse.max);
-    vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
-    vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
-
-    const __m128 vmin = _mm_load_ps(params->sse.min);
-    vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
-    vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      _mm_storeu_ps(c0, vacc0x0123);
-      _mm_storeu_ps(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        _mm_storeu_ps(c0, vacc0x0123);
-
-        vacc0x0123 = vacc0x4567;
-
-        c0 += 4;
-      }
-      if (nc & 2) {
-        _mm_storel_pi((__m64*) c0, vacc0x0123);
-
-        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
-
-        c0 += 2;
-      }
-      if (nc & 1) {
-        _mm_store_ss(c0, vacc0x0123);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
index 45022cc..9532828 100644
--- a/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
@@ -151,13 +151,9 @@
         LDR q7, [x4], 16
         FMLA v31.4s, v25.4s, v3.s[2]
         FMLA v16.4s, v26.4s, v0.s[3]
-        PRFM PLDL1KEEP, [x5, 128]
         FMLA v17.4s, v27.4s, v0.s[3]
-        PRFM PLDL1KEEP, [x5, 192]
         FMLA v18.4s, v26.4s, v1.s[3]
-        PRFM PLDL1KEEP, [x5, 256]
         FMLA v19.4s, v27.4s, v1.s[3]
-        PRFM PLDL1KEEP, [x5, 320]
         FMLA v28.4s, v26.4s, v2.s[3]
         FMLA v29.4s, v27.4s, v2.s[3]
         FMLA v30.4s, v26.4s, v3.s[3]
@@ -243,13 +239,9 @@
         LDR q7, [x4], 16
         FMLA v31.4s, v25.4s, v3.s[2]
         FMLA v16.4s, v26.4s, v0.s[3]
-        PRFM PLDL1KEEP, [x5, 128]
         FMLA v17.4s, v27.4s, v0.s[3]
-        PRFM PLDL1KEEP, [x5, 192]
         FMLA v18.4s, v26.4s, v1.s[3]
-        PRFM PLDL1KEEP, [x5, 256]
         FMLA v19.4s, v27.4s, v1.s[3]
-        PRFM PLDL1KEEP, [x5, 320]
         FMLA v28.4s, v26.4s, v2.s[3]
         FMLA v29.4s, v27.4s, v2.s[3]
         FMLA v30.4s, v26.4s, v3.s[3]
diff --git a/src/f32-gemminc/4x8-sse.c b/src/f32-gemminc/4x8-sse.c
deleted file mode 100644
index b16e2fe..0000000
--- a/src/f32-gemminc/4x8-sse.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Auto-generated file. Do not edit!
- *   Template: src/f32-gemm/sse.c.in
- *   Generator: tools/xngen
- */
-
-
-#include <assert.h>
-
-#include <xmmintrin.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_4x8__sse(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float*restrict a,
-    size_t a_stride,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float* restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    __m128 vacc0x0123 = _mm_load_ps(acc + 0);
-    __m128 vacc0x4567 = _mm_load_ps(acc + 4);
-    __m128 vacc1x0123 = _mm_load_ps(acc + 8);
-    __m128 vacc1x4567 = _mm_load_ps(acc + 12);
-    __m128 vacc2x0123 = _mm_load_ps(acc + 16);
-    __m128 vacc2x4567 = _mm_load_ps(acc + 20);
-    __m128 vacc3x0123 = _mm_load_ps(acc + 24);
-    __m128 vacc3x4567 = _mm_load_ps(acc + 28);
-    acc += 32;
-
-    size_t k = kc;
-    do {
-      const __m128 va0 = _mm_load1_ps(a0);
-      a0 += 1;
-      const __m128 va1 = _mm_load1_ps(a1);
-      a1 += 1;
-      const __m128 va2 = _mm_load1_ps(a2);
-      a2 += 1;
-      const __m128 va3 = _mm_load1_ps(a3);
-      a3 += 1;
-
-      const __m128 vb0123 = _mm_load_ps(w);
-      const __m128 vb4567 = _mm_load_ps(w + 4);
-      w += 8;
-
-      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
-      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
-      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
-      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
-      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
-      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
-      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
-      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
-
-      k -= sizeof(float);
-    } while (k != 0);
-
-    const __m128 vmax = _mm_load_ps(params->sse.max);
-    vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
-    vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
-    vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
-    vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
-    vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
-    vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
-    vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
-    vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
-
-    const __m128 vmin = _mm_load_ps(params->sse.min);
-    vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
-    vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
-    vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
-    vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
-    vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
-    vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
-    vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
-    vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      _mm_storeu_ps(c0, vacc0x0123);
-      _mm_storeu_ps(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-      _mm_storeu_ps(c1, vacc1x0123);
-      _mm_storeu_ps(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      _mm_storeu_ps(c2, vacc2x0123);
-      _mm_storeu_ps(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      _mm_storeu_ps(c3, vacc3x0123);
-      _mm_storeu_ps(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        _mm_storeu_ps(c0, vacc0x0123);
-        _mm_storeu_ps(c1, vacc1x0123);
-        _mm_storeu_ps(c2, vacc2x0123);
-        _mm_storeu_ps(c3, vacc3x0123);
-
-        vacc0x0123 = vacc0x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc3x0123 = vacc3x4567;
-
-        c0 += 4;
-        c1 += 4;
-        c2 += 4;
-        c3 += 4;
-      }
-      if (nc & 2) {
-        _mm_storel_pi((__m64*) c0, vacc0x0123);
-        _mm_storel_pi((__m64*) c1, vacc1x0123);
-        _mm_storel_pi((__m64*) c2, vacc2x0123);
-        _mm_storel_pi((__m64*) c3, vacc3x0123);
-
-        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
-        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
-        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
-        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
-
-        c0 += 2;
-        c1 += 2;
-        c2 += 2;
-        c3 += 2;
-      }
-      if (nc & 1) {
-        _mm_store_ss(c0, vacc0x0123);
-        _mm_store_ss(c1, vacc1x0123);
-        _mm_store_ss(c2, vacc2x0123);
-        _mm_store_ss(c3, vacc3x0123);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-hswish/neon.c b/src/f32-hswish/neon.c
index d29a206..309c15b 100644
--- a/src/f32-hswish/neon.c
+++ b/src/f32-hswish/neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-hswish/neonfma.c b/src/f32-hswish/neonfma.c
index 828545a..3e76bd9 100644
--- a/src/f32-hswish/neonfma.c
+++ b/src/f32-hswish/neonfma.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-hswish/psimd.c b/src/f32-hswish/psimd.c
index 43758bf..b285d34 100644
--- a/src/f32-hswish/psimd.c
+++ b/src/f32-hswish/psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-hswish/scalar.c b/src/f32-hswish/scalar.c
index b4d251a..ace4fca 100644
--- a/src/f32-hswish/scalar.c
+++ b/src/f32-hswish/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-hswish/sse.c b/src/f32-hswish/sse.c
index d0f0e5f..d8b7af1 100644
--- a/src/f32-hswish/sse.c
+++ b/src/f32-hswish/sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-maxpool/9p8q-psimd.c b/src/f32-maxpool/9p8q-psimd.c
index 570b3c1..c973db3 100644
--- a/src/f32-maxpool/9p8q-psimd.c
+++ b/src/f32-maxpool/9p8q-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-maxpool/9p8q-scalar.c b/src/f32-maxpool/9p8q-scalar.c
index e05d025..1108170 100644
--- a/src/f32-maxpool/9p8q-scalar.c
+++ b/src/f32-maxpool/9p8q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-maxpool/9p8q-sse.c b/src/f32-maxpool/9p8q-sse.c
index dc8c117..ecf3f76 100644
--- a/src/f32-maxpool/9p8q-sse.c
+++ b/src/f32-maxpool/9p8q-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/mp9p8q-neon.c b/src/f32-pavgpool/mp9p8q-neon.c
index 70422c5..0721dae 100644
--- a/src/f32-pavgpool/mp9p8q-neon.c
+++ b/src/f32-pavgpool/mp9p8q-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/mp9p8q-psimd.c b/src/f32-pavgpool/mp9p8q-psimd.c
index 78fd150..4d4826e 100644
--- a/src/f32-pavgpool/mp9p8q-psimd.c
+++ b/src/f32-pavgpool/mp9p8q-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/mp9p8q-scalar.c b/src/f32-pavgpool/mp9p8q-scalar.c
index b618240..2793988 100644
--- a/src/f32-pavgpool/mp9p8q-scalar.c
+++ b/src/f32-pavgpool/mp9p8q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/mp9p8q-sse.c b/src/f32-pavgpool/mp9p8q-sse.c
index 89b97ed..28a2c2a 100644
--- a/src/f32-pavgpool/mp9p8q-sse.c
+++ b/src/f32-pavgpool/mp9p8q-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/up9-neon.c b/src/f32-pavgpool/up9-neon.c
index 2195019..1b4d3ae 100644
--- a/src/f32-pavgpool/up9-neon.c
+++ b/src/f32-pavgpool/up9-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/up9-psimd.c b/src/f32-pavgpool/up9-psimd.c
index 49637b4..6710df4 100644
--- a/src/f32-pavgpool/up9-psimd.c
+++ b/src/f32-pavgpool/up9-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/up9-scalar.c b/src/f32-pavgpool/up9-scalar.c
index 1778d87..37b5719 100644
--- a/src/f32-pavgpool/up9-scalar.c
+++ b/src/f32-pavgpool/up9-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-pavgpool/up9-sse.c b/src/f32-pavgpool/up9-sse.c
index f10a613..5e2db82 100644
--- a/src/f32-pavgpool/up9-sse.c
+++ b/src/f32-pavgpool/up9-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-prelu/x4-psimd.c b/src/f32-prelu/x4-psimd.c
index 3c0772d..3c3619f 100644
--- a/src/f32-prelu/x4-psimd.c
+++ b/src/f32-prelu/x4-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-prelu/x4-scalar.c b/src/f32-prelu/x4-scalar.c
index 168b86b..6e16835 100644
--- a/src/f32-prelu/x4-scalar.c
+++ b/src/f32-prelu/x4-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <math.h>
diff --git a/src/f32-prelu/x4-sse.c b/src/f32-prelu/x4-sse.c
index 2c0038a..b0cf097 100644
--- a/src/f32-prelu/x4-sse.c
+++ b/src/f32-prelu/x4-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-rmax/avx.c b/src/f32-rmax/avx.c
index e242283..b68106d 100644
--- a/src/f32-rmax/avx.c
+++ b/src/f32-rmax/avx.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-rmax/avx512f.c b/src/f32-rmax/avx512f.c
index ce96155..78c0467 100644
--- a/src/f32-rmax/avx512f.c
+++ b/src/f32-rmax/avx512f.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-rmax/neon.c b/src/f32-rmax/neon.c
index a0cb076..372637e 100644
--- a/src/f32-rmax/neon.c
+++ b/src/f32-rmax/neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-rmax/scalar.c b/src/f32-rmax/scalar.c
index bc8d5f3..527968d 100644
--- a/src/f32-rmax/scalar.c
+++ b/src/f32-rmax/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-rmax/sse.c b/src/f32-rmax/sse.c
index 8968565..e75cd01 100644
--- a/src/f32-rmax/sse.c
+++ b/src/f32-rmax/sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vadd/psimd.c b/src/f32-vadd/psimd.c
index 2e06ff5..e74097f 100644
--- a/src/f32-vadd/psimd.c
+++ b/src/f32-vadd/psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vadd/scalar.c b/src/f32-vadd/scalar.c
index 075fcb9..4b209d2 100644
--- a/src/f32-vadd/scalar.c
+++ b/src/f32-vadd/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vadd/sse.c b/src/f32-vadd/sse.c
index 2f49638..f65ce63 100644
--- a/src/f32-vadd/sse.c
+++ b/src/f32-vadd/sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmul/psimd.c b/src/f32-vmul/psimd.c
index e42ff9f..b731451 100644
--- a/src/f32-vmul/psimd.c
+++ b/src/f32-vmul/psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmul/scalar.c b/src/f32-vmul/scalar.c
index 8f5c2f3..7576a4a 100644
--- a/src/f32-vmul/scalar.c
+++ b/src/f32-vmul/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmul/sse.c b/src/f32-vmul/sse.c
index cb1087a..f30b9b7 100644
--- a/src/f32-vmul/sse.c
+++ b/src/f32-vmul/sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmulcaddc/c1-scalar-x2.c b/src/f32-vmulcaddc/c1-scalar-x2.c
index 1b7b7e9..e6c6369 100644
--- a/src/f32-vmulcaddc/c1-scalar-x2.c
+++ b/src/f32-vmulcaddc/c1-scalar-x2.c
@@ -2,12 +2,10 @@
 //   Template: src/f32-vmulcaddc/scalar.c.in
 //   Generator: tools/xngen
 //
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmulcaddc/c4-neon-x2.c b/src/f32-vmulcaddc/c4-neon-x2.c
index cee2f83..3a43247 100644
--- a/src/f32-vmulcaddc/c4-neon-x2.c
+++ b/src/f32-vmulcaddc/c4-neon-x2.c
@@ -2,12 +2,10 @@
 //   Template: src/f32-vmulcaddc/neon.c.in
 //   Generator: tools/xngen
 //
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmulcaddc/c4-neonfma-x2.c b/src/f32-vmulcaddc/c4-neonfma-x2.c
index f395592..c2cbaeb 100644
--- a/src/f32-vmulcaddc/c4-neonfma-x2.c
+++ b/src/f32-vmulcaddc/c4-neonfma-x2.c
@@ -2,12 +2,10 @@
 //   Template: src/f32-vmulcaddc/neon.c.in
 //   Generator: tools/xngen
 //
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmulcaddc/c4-psimd-x2.c b/src/f32-vmulcaddc/c4-psimd-x2.c
index dc9344c..8e57e6a 100644
--- a/src/f32-vmulcaddc/c4-psimd-x2.c
+++ b/src/f32-vmulcaddc/c4-psimd-x2.c
@@ -2,12 +2,10 @@
 //   Template: src/f32-vmulcaddc/psimd.c.in
 //   Generator: tools/xngen
 //
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmulcaddc/c4-sse-x2.c b/src/f32-vmulcaddc/c4-sse-x2.c
index b48acf1..effdd31 100644
--- a/src/f32-vmulcaddc/c4-sse-x2.c
+++ b/src/f32-vmulcaddc/c4-sse-x2.c
@@ -2,12 +2,10 @@
 //   Template: src/f32-vmulcaddc/sse.c.in
 //   Generator: tools/xngen
 //
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vmulcaddc/neon.c.in b/src/f32-vmulcaddc/neon.c.in
index dd7b824..978aecf 100644
--- a/src/f32-vmulcaddc/neon.c.in
+++ b/src/f32-vmulcaddc/neon.c.in
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 $assert CR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
diff --git a/src/f32-vmulcaddc/psimd.c.in b/src/f32-vmulcaddc/psimd.c.in
index 51c27b1..4933bcc 100644
--- a/src/f32-vmulcaddc/psimd.c.in
+++ b/src/f32-vmulcaddc/psimd.c.in
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 $assert CR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
diff --git a/src/f32-vmulcaddc/scalar.c.in b/src/f32-vmulcaddc/scalar.c.in
index df86ff2..53d0c9f 100644
--- a/src/f32-vmulcaddc/scalar.c.in
+++ b/src/f32-vmulcaddc/scalar.c.in
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 $assert CR > 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
diff --git a/src/f32-vmulcaddc/sse.c.in b/src/f32-vmulcaddc/sse.c.in
index e358ec8..6f07e4a 100644
--- a/src/f32-vmulcaddc/sse.c.in
+++ b/src/f32-vmulcaddc/sse.c.in
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 $assert CR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
diff --git a/src/f32-vsub/psimd.c b/src/f32-vsub/psimd.c
index bdf701a..db01a64 100644
--- a/src/f32-vsub/psimd.c
+++ b/src/f32-vsub/psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vsub/scalar.c b/src/f32-vsub/scalar.c
index a53b419..a9b50da 100644
--- a/src/f32-vsub/scalar.c
+++ b/src/f32-vsub/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/f32-vsub/sse.c b/src/f32-vsub/sse.c
index 0722622..5c0d7e8 100644
--- a/src/f32-vsub/sse.c
+++ b/src/f32-vsub/sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/max-pooling.c b/src/max-pooling.c
index c8a4d67..25e5e66 100644
--- a/src/max-pooling.c
+++ b/src/max-pooling.c
@@ -493,7 +493,7 @@
   const size_t pooling_size = pooling_height * pooling_width;
   const size_t output_height = max_pooling_op->output_height;
   const size_t output_width = max_pooling_op->output_width;
-  /* Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer */
+  // Micro-kernel may read up to (mr - 1) elements after the end of the indirection buffer.
   const uint32_t mr = xnn_params.f32.maxpool.mr;
 
   const size_t step_width =
diff --git a/src/q8-avgpool/mp9p8q-neon.c b/src/q8-avgpool/mp9p8q-neon.c
index 08a0b49..f87a7256 100644
--- a/src/q8-avgpool/mp9p8q-neon.c
+++ b/src/q8-avgpool/mp9p8q-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-avgpool/mp9p8q-scalar.c b/src/q8-avgpool/mp9p8q-scalar.c
index 6971756..6002fac 100644
--- a/src/q8-avgpool/mp9p8q-scalar.c
+++ b/src/q8-avgpool/mp9p8q-scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -38,7 +36,8 @@
   const int32_t voutput_max = params->scalar.output_max_less_zero_point;
   const int32_t voutput_zero_point = params->scalar.output_zero_point;
   do {
-    /* First pass */ {
+    // First pass.
+    {
       const uint8_t* i0 = *input++;
       const uint8_t* i1 = *input++;
       const uint8_t* i2 = *input++;
@@ -76,7 +75,7 @@
     }
 
     size_t m = ks;
-    /* Intermediate passes */
+    // Intermediate passes.
     for (m -= 9; m > 8; m -= 8) {
       const uint8_t* i0 = *input++;
       const uint8_t* i1 = *input++;
@@ -114,7 +113,8 @@
       } while (--k != 0);
     }
 
-    /* Last pass */ {
+    // Last pass.
+    {
       const uint8_t* i0 = input[0];
       const uint8_t* i1 = input[1];
       const uint8_t* i2 = input[2];
diff --git a/src/q8-avgpool/mp9p8q-sse2.c b/src/q8-avgpool/mp9p8q-sse2.c
index a27074a..a21c6d9 100644
--- a/src/q8-avgpool/mp9p8q-sse2.c
+++ b/src/q8-avgpool/mp9p8q-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-avgpool/up9-neon.c b/src/q8-avgpool/up9-neon.c
index d228186..88b6ea4 100644
--- a/src/q8-avgpool/up9-neon.c
+++ b/src/q8-avgpool/up9-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-avgpool/up9-scalar.c b/src/q8-avgpool/up9-scalar.c
index b8f2fa4..31d9433 100644
--- a/src/q8-avgpool/up9-scalar.c
+++ b/src/q8-avgpool/up9-scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-avgpool/up9-sse2.c b/src/q8-avgpool/up9-sse2.c
index 5998790..e98d1ba 100644
--- a/src/q8-avgpool/up9-sse2.c
+++ b/src/q8-avgpool/up9-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-dwconv/up1x9-scalar.c b/src/q8-dwconv/up1x9-scalar.c
index 2014e86..c7cd6ed 100644
--- a/src/q8-dwconv/up1x9-scalar.c
+++ b/src/q8-dwconv/up1x9-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <xnnpack/scalar-utils.h>
 #include <xnnpack/dwconv.h>
diff --git a/src/q8-dwconv/up8x9-aarch32-neon.S b/src/q8-dwconv/up8x9-aarch32-neon.S
index aceabf9..e882b83 100644
--- a/src/q8-dwconv/up8x9-aarch32-neon.S
+++ b/src/q8-dwconv/up8x9-aarch32-neon.S
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 #include <xnnpack/assembly.h>
 
diff --git a/src/q8-dwconv/up8x9-neon.c b/src/q8-dwconv/up8x9-neon.c
index 30fae41..10e6a53 100644
--- a/src/q8-dwconv/up8x9-neon.c
+++ b/src/q8-dwconv/up8x9-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <arm_neon.h>
 
@@ -31,7 +29,7 @@
   const uint8x8_t voutput_max = vld1_dup_u8(&params->neon.output_max);
 
 #ifdef __aarch64__
-  /* Larger number of registers on AArch64 make it possible to process few pixels at a time */
+  // Larger number of registers on AArch64 makes it possible to process a few pixels at a time.
   if (input_stride == 3 * sizeof(void*)) {
     for (; output_width >= 3; output_width -= 3) {
       const uint8_t* i00 = input[ 0];
diff --git a/src/q8-dwconv/up8x9-sse2.c b/src/q8-dwconv/up8x9-sse2.c
index d296127..f9ddd77 100644
--- a/src/q8-dwconv/up8x9-sse2.c
+++ b/src/q8-dwconv/up8x9-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <immintrin.h>
 
diff --git a/src/q8-gavgpool/mp7p7q-neon.c b/src/q8-gavgpool/mp7p7q-neon.c
index 3a4aa58..f62de3a 100644
--- a/src/q8-gavgpool/mp7p7q-neon.c
+++ b/src/q8-gavgpool/mp7p7q-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gavgpool/mp7p7q-scalar.c b/src/q8-gavgpool/mp7p7q-scalar.c
index dd6658c..ded3262 100644
--- a/src/q8-gavgpool/mp7p7q-scalar.c
+++ b/src/q8-gavgpool/mp7p7q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -33,7 +31,8 @@
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
   const size_t input_increment = 7 * input_stride - n;
 
-  /* First pass */ {
+  // First pass.
+  {
     const int32_t vbias = params->scalar.bias;
 
     int32_t* b = buffer;
@@ -60,7 +59,7 @@
       *b++ = vacc;
     } while (--k != 0);
   }
-  /* Intermediate passes */
+  // Intermediate passes.
   for (m -= 7; m > 7; m -= 7) {
     i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
     i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
@@ -94,7 +93,8 @@
     } while (--k != 0);
   }
 
-  /* Last pass */ {
+  // Last pass.
+  {
     const int32_t vmultiplier = params->scalar.multiplier;
     const int64_t vrounding = params->scalar.rounding;
     const uint32_t vshift = params->scalar.right_shift;
diff --git a/src/q8-gavgpool/mp7p7q-sse2.c b/src/q8-gavgpool/mp7p7q-sse2.c
index 77b3d02..1874a66 100644
--- a/src/q8-gavgpool/mp7p7q-sse2.c
+++ b/src/q8-gavgpool/mp7p7q-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gavgpool/up7-neon.c b/src/q8-gavgpool/up7-neon.c
index fb4aad8..09a2d20 100644
--- a/src/q8-gavgpool/up7-neon.c
+++ b/src/q8-gavgpool/up7-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gavgpool/up7-scalar.c b/src/q8-gavgpool/up7-scalar.c
index b437ef9..4934b35 100644
--- a/src/q8-gavgpool/up7-scalar.c
+++ b/src/q8-gavgpool/up7-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gavgpool/up7-sse2.c b/src/q8-gavgpool/up7-sse2.c
index 6d8c1bc..86ea7c7 100644
--- a/src/q8-gavgpool/up7-sse2.c
+++ b/src/q8-gavgpool/up7-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gemm/2x2-scalar.c b/src/q8-gemm/2x2-scalar.c
index 4c7f892..d30a1bf 100644
--- a/src/q8-gemm/2x2-scalar.c
+++ b/src/q8-gemm/2x2-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gemm/2x4c8-sse2.c b/src/q8-gemm/2x4c8-sse2.c
index 82e2053..8551ad6 100644
--- a/src/q8-gemm/2x4c8-sse2.c
+++ b/src/q8-gemm/2x4c8-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -18,24 +16,19 @@
 
 static inline __m128i sse_reduce4_i32(__m128i x, __m128i y, __m128i z, __m128i w) {
 #if defined(__SSSE3__) && !defined(__ANDROID__)
-  /* xxyy = ( y2 + y3, y0 + y1, x2 + x3, x0 + x1 ) */
+  // xxyy = ( y2 + y3, y0 + y1, x2 + x3, x0 + x1 )
   const __m128i xxyy = _mm_hadd_epi32(x, y);
-  /* zzww = ( w2 + w3, w0 + w1, z2 + z3, z0 + z1 ) */
+  // zzww = ( w2 + w3, w0 + w1, z2 + z3, z0 + z1 )
   const __m128i zzww = _mm_hadd_epi32(z, w);
-  /* xyzw = ( w0 + w1 + w2 + w3, y0 + y1 + y2 + y3, z0 + z1 + z2 + z3, x0 + x1 +
-   * x2 + x3 ) */
+  // xyzw = ( w0 + w1 + w2 + w3, y0 + y1 + y2 + y3, z0 + z1 + z2 + z3, x0 + x1 + x2 + x3 )
   return _mm_hadd_epi32(xxyy, zzww);
 #else
-  /* xzxz = ( z1 + z3, x1 + x3, z0 + z2, x0 + x2 ) */
-  const __m128i xzxz =
-      _mm_add_epi32(_mm_unpacklo_epi32(x, z), _mm_unpackhi_epi32(x, z));
-  /* ywyw = ( w1 + w3, y1 + y3, w0 + w2, y0 + y2 ) */
-  const __m128i ywyw =
-      _mm_add_epi32(_mm_unpacklo_epi32(y, w), _mm_unpackhi_epi32(y, w));
-  /* xyzw = ( w0 + w2 + w1 + w3, y0 + y2 + y1 + y3, z0 + z2 + z1 + z3, x0 + x2 +
-   * x1 + x3 ) */
-  return _mm_add_epi32(
-      _mm_unpacklo_epi32(xzxz, ywyw), _mm_unpackhi_epi32(xzxz, ywyw));
+  // xzxz = ( z1 + z3, x1 + x3, z0 + z2, x0 + x2 )
+  const __m128i xzxz = _mm_add_epi32(_mm_unpacklo_epi32(x, z), _mm_unpackhi_epi32(x, z));
+  // ywyw = ( w1 + w3, y1 + y3, w0 + w2, y0 + y2 )
+  const __m128i ywyw = _mm_add_epi32(_mm_unpacklo_epi32(y, w), _mm_unpackhi_epi32(y, w));
+  // xyzw = ( w0 + w2 + w1 + w3, y0 + y2 + y1 + y3, z0 + z2 + z1 + z3, x0 + x2 + x1 + x3 )
+  return _mm_add_epi32(_mm_unpacklo_epi32(xzxz, ywyw), _mm_unpackhi_epi32(xzxz, ywyw));
 #endif
 }
 
diff --git a/src/q8-gemm/4x4c2-sse2.c b/src/q8-gemm/4x4c2-sse2.c
index 714280b..968f400 100644
--- a/src/q8-gemm/4x4c2-sse2.c
+++ b/src/q8-gemm/4x4c2-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gemm/4x8-neon.c b/src/q8-gemm/4x8-neon.c
index 4b025ab..f3c8819 100644
--- a/src/q8-gemm/4x8-neon.c
+++ b/src/q8-gemm/4x8-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-gemm/8x8-neon.c b/src/q8-gemm/8x8-neon.c
index 675f312..d0291dc 100644
--- a/src/q8-gemm/8x8-neon.c
+++ b/src/q8-gemm/8x8-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-igemm/2x2-scalar.c b/src/q8-igemm/2x2-scalar.c
index 18a398c..6d0c961 100644
--- a/src/q8-igemm/2x2-scalar.c
+++ b/src/q8-igemm/2x2-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-igemm/4x4c2-sse2.c b/src/q8-igemm/4x4c2-sse2.c
index 1c00a26..72b5f32 100644
--- a/src/q8-igemm/4x4c2-sse2.c
+++ b/src/q8-igemm/4x4c2-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-igemm/4x8-neon.c b/src/q8-igemm/4x8-neon.c
index 611d50b..e7ad227 100644
--- a/src/q8-igemm/4x8-neon.c
+++ b/src/q8-igemm/4x8-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-igemm/8x8-neon.c b/src/q8-igemm/8x8-neon.c
index 1f68057..ec88e9c 100644
--- a/src/q8-igemm/8x8-neon.c
+++ b/src/q8-igemm/8x8-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/q8-vadd/neon.c b/src/q8-vadd/neon.c
index 5b3b421..3306932 100644
--- a/src/q8-vadd/neon.c
+++ b/src/q8-vadd/neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <arm_neon.h>
 
@@ -37,7 +35,7 @@
     const uint8x16_t va23 = vld1q_u8(a); a += 16;
     const uint8x16_t vb23 = vld1q_u8(b); b += 16;
 
-    /* Subtract zero point */
+    // Subtract zero point.
     const int16x8_t vxa0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va01), va_zero_point));
     const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb01), vb_zero_point));
     const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point));
@@ -47,7 +45,7 @@
     const int16x8_t vxa3 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va23), va_zero_point));
     const int16x8_t vxb3 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb23), vb_zero_point));
 
-    /* Multiply by factors and accumulate products */
+    // Multiply by factors and accumulate products.
     int32x4_t vacc0_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa0)), va_multiplier);
     int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier);
     int32x4_t vacc2_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa2)), va_multiplier);
@@ -66,7 +64,7 @@
     vacc2_hi = vmlaq_s32(vacc2_hi, vmovl_high_s16(vxb2), vb_multiplier);
     vacc3_hi = vmlaq_s32(vacc3_hi, vmovl_high_s16(vxb3), vb_multiplier);
 
-    /* Shift right and round */
+    // Shift right and round.
     vacc0_lo = vsraq_n_s32(vacc0_lo, vbicq_s32(vacc0_lo, vzero_shift_mask), 31);
     vacc1_lo = vsraq_n_s32(vacc1_lo, vbicq_s32(vacc1_lo, vzero_shift_mask), 31);
     vacc2_lo = vsraq_n_s32(vacc2_lo, vbicq_s32(vacc2_lo, vzero_shift_mask), 31);
@@ -85,7 +83,7 @@
     vacc2_hi = vrshlq_s32(vacc2_hi, vright_shift);
     vacc3_hi = vrshlq_s32(vacc3_hi, vright_shift);
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
     const int16x8_t vacc0 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0_lo), vacc0_hi), vy_zero_point);
     const int16x8_t vacc1 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1_lo), vacc1_hi), vy_zero_point);
     const int16x8_t vacc2 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2_lo), vacc2_hi), vy_zero_point);
@@ -107,13 +105,13 @@
     const uint8x16_t va01 = vld1q_u8(a); a += 16;
     const uint8x16_t vb01 = vld1q_u8(b); b += 16;
 
-    /* Subtract zero point */
+    // Subtract zero point.
     const int16x8_t vxa0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va01), va_zero_point));
     const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb01), vb_zero_point));
     const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point));
     const int16x8_t vxb1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb01), vb_zero_point));
 
-    /* Multiply by factors and accumulate products */
+    // Multiply by factors and accumulate products.
     int32x4_t vacc0_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa0)), va_multiplier);
     int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier);
     int32x4_t vacc0_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa0)), va_multiplier);
@@ -127,7 +125,7 @@
     vacc0_hi = vmlaq_s32(vacc0_hi, vmovl_s16(vget_high_s16(vxb0)), vb_multiplier);
     vacc1_hi = vmlaq_s32(vacc1_hi, vmovl_s16(vget_high_s16(vxb1)), vb_multiplier);
 
-    /* Shift right and round */
+    // Shift right and round.
     vacc0_lo = vsraq_n_s32(vacc0_lo, vbicq_s32(vacc0_lo, vzero_shift_mask), 31);
     vacc1_lo = vsraq_n_s32(vacc1_lo, vbicq_s32(vacc1_lo, vzero_shift_mask), 31);
     vacc0_hi = vsraq_n_s32(vacc0_hi, vbicq_s32(vacc0_hi, vzero_shift_mask), 31);
@@ -138,7 +136,7 @@
     vacc0_hi = vrshlq_s32(vacc0_hi, vright_shift);
     vacc1_hi = vrshlq_s32(vacc1_hi, vright_shift);
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
     const int16x8_t vacc0 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0_lo), vqmovn_s32(vacc0_hi)), vy_zero_point);
     const int16x8_t vacc1 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1_lo), vqmovn_s32(vacc1_hi)), vy_zero_point);
 
@@ -153,11 +151,11 @@
     const uint8x8_t va = vld1_u8(a); a += 8;
     const uint8x8_t vb = vld1_u8(b); b += 8;
 
-    /* Subtract zero point */
+    // Subtract zero point.
     const int16x8_t vxa = vreinterpretq_s16_u16(vsubl_u8(va, va_zero_point));
     const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));
 
-    /* Multiply by factors and accumulate products */
+    // Multiply by factors and accumulate products.
     int32x4_t vacc_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa)), va_multiplier);
 #ifdef __aarch64__
     int32x4_t vacc_hi = vmulq_s32(vmovl_high_s16(vxa), va_multiplier);
@@ -172,14 +170,14 @@
     vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);
 #endif
 
-    /* Shift right and round */
+    // Shift right and round.
     vacc_lo = vsraq_n_s32(vacc_lo, vbicq_s32(vacc_lo, vzero_shift_mask), 31);
     vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
 
     vacc_lo = vrshlq_s32(vacc_lo, vright_shift);
     vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
 #ifdef __aarch64__
     const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), vy_zero_point);
 #else
@@ -196,11 +194,11 @@
     const uint8x8_t va = vld1_u8(a);
     const uint8x8_t vb = vld1_u8(b);
 
-    /* Subtract zero point */
+    // Subtract zero point.
     const int16x8_t vxa = vreinterpretq_s16_u16(vsubl_u8(va, va_zero_point));
     const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));
 
-    /* Multiply by factors and accumulate products */
+    // Multiply by factors and accumulate products.
     int32x4_t vacc_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa)), va_multiplier);
 #ifdef __aarch64__
     int32x4_t vacc_hi = vmulq_s32(vmovl_high_s16(vxa), va_multiplier);
@@ -215,14 +213,14 @@
     vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);
 #endif
 
-    /* Shift right and round */
+    // Shift right and round.
     vacc_lo = vsraq_n_s32(vacc_lo, vbicq_s32(vacc_lo, vzero_shift_mask), 31);
     vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
 
     vacc_lo = vrshlq_s32(vacc_lo, vright_shift);
     vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
 #ifdef __aarch64__
     const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), vy_zero_point);
 #else
diff --git a/src/q8-vadd/scalar.c b/src/q8-vadd/scalar.c
index 6c20a0e..ea24482 100644
--- a/src/q8-vadd/scalar.c
+++ b/src/q8-vadd/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -35,18 +33,18 @@
     const int32_t va = (int32_t) (uint32_t) *a++;
     const int32_t vb = (int32_t) (uint32_t) *b++;
 
-    /* Multiply by factors */
+    // Multiply by factors.
     const int32_t va_product = va * va_multiplier;
     const int32_t vb_product = vb * vb_multiplier;
 
-    /* Accumulate products */
+    // Accumulate products.
     const int32_t vacc = vzero_point_product + va_product + vb_product;
 
-    /* Shift right and round */
+    // Shift right and round.
     const int32_t vremainder = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
     int32_t vy = asr_s32(vacc, vshift) + (int32_t) (vremainder > vremainder_threshold);
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
     vy += vy_zero_point;
     vy = vy < vy_min ? vy_min : vy;
     vy = vy > vy_max ? vy_max : vy;
diff --git a/src/q8-vadd/sse2.c b/src/q8-vadd/sse2.c
index c962f23..51364a5 100644
--- a/src/q8-vadd/sse2.c
+++ b/src/q8-vadd/sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <immintrin.h>
 
@@ -41,7 +39,7 @@
     const __m128i vxa = _mm_unpacklo_epi8(va, vzero);
     const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);
 
-    /* Multiply by factors */
+    // Multiply by factors.
     const __m128i va_product_lo = _mm_mullo_epi16(vxa, va_multiplier_lo);
     const __m128i va_product_hi =
       _mm_add_epi16(_mm_mulhi_epu16(vxa, va_multiplier_lo), _mm_mullo_epi16(vxa, va_multiplier_hi));
@@ -50,14 +48,14 @@
     const __m128i vb_product_hi =
       _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo), _mm_mullo_epi16(vxb, vb_multiplier_hi));
 
-    /* Accumulate products */
+    // Accumulate products.
     __m128i vacc_lo = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(va_product_lo, va_product_hi));
     __m128i vacc_hi = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(va_product_lo, va_product_hi));
 
     vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vb_product_lo, vb_product_hi));
     vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vb_product_lo, vb_product_hi));
 
-    /* Shift right and round */
+    // Shift right and round.
     const __m128i vrem_lo =
       _mm_add_epi32(_mm_and_si128(vacc_lo, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo));
     const __m128i vrem_hi =
@@ -66,7 +64,7 @@
     vacc_lo = _mm_sub_epi32(_mm_sra_epi32(vacc_lo, vshift), _mm_cmpgt_epi32(vrem_lo, vremainder_threshold));
     vacc_hi = _mm_sub_epi32(_mm_sra_epi32(vacc_hi, vshift), _mm_cmpgt_epi32(vrem_hi, vremainder_threshold));
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
     const __m128i vy_zero_point = _mm_load_si128((const __m128i*) params->sse2.y_zero_point);
     const __m128i vacc = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vy_zero_point);
     __m128i vy = _mm_packus_epi16(vacc, vacc);
@@ -83,7 +81,7 @@
     const __m128i vxa = _mm_unpacklo_epi8(va, vzero);
     const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);
 
-    /* Multiply by factors */
+    // Multiply by factors.
     const __m128i va_product_lo = _mm_mullo_epi16(vxa, va_multiplier_lo);
     const __m128i va_product_hi =
       _mm_add_epi16(_mm_mulhi_epu16(vxa, va_multiplier_lo), _mm_mullo_epi16(vxa, va_multiplier_hi));
@@ -92,14 +90,14 @@
     const __m128i vb_product_hi =
       _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo), _mm_mullo_epi16(vxb, vb_multiplier_hi));
 
-    /* Accumulate products */
+    // Accumulate products.
     __m128i vacc_lo = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(va_product_lo, va_product_hi));
     __m128i vacc_hi = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(va_product_lo, va_product_hi));
 
     vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vb_product_lo, vb_product_hi));
     vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vb_product_lo, vb_product_hi));
 
-    /* Shift right and round */
+    // Shift right and round.
     const __m128i vrem_lo =
       _mm_add_epi32(_mm_and_si128(vacc_lo, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo));
     const __m128i vrem_hi =
@@ -108,7 +106,7 @@
     vacc_lo = _mm_sub_epi32(_mm_sra_epi32(vacc_lo, vshift), _mm_cmpgt_epi32(vrem_lo, vremainder_threshold));
     vacc_hi = _mm_sub_epi32(_mm_sra_epi32(vacc_hi, vshift), _mm_cmpgt_epi32(vrem_hi, vremainder_threshold));
 
-    /* Pack, saturate, and add output zero point */
+    // Pack, saturate, and add output zero point.
     const __m128i vy_zero_point = _mm_load_si128((const __m128i*) params->sse2.y_zero_point);
     const __m128i vacc = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vy_zero_point);
     __m128i vy = _mm_packus_epi16(vacc, vacc);
diff --git a/src/requantization/fp32-neon.c b/src/requantization/fp32-neon.c
index 4d2f67c..d4066fe 100644
--- a/src/requantization/fp32-neon.c
+++ b/src/requantization/fp32-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -47,37 +45,31 @@
     const int32x4_t w = vld1q_s32(input + 12);
     input += 16;
 
-    /*
-     * Convert int32_t input to FP32 and multiply by FP32 scale.
-     * Both operations involve statistically unbiased roundings:
-     * - Large int32_t values can't be exactly represented as FP32. The conversion instruction in ARM NEON would
-     *   round it to nearest FP32 value with ties to even.
-     * - Product of two FP32 values is generally not exactly representation as an FP32 value, and will be rounded
-     *   to nearest FP32 value with ties to even.
-     */
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve statistically unbiased roundings:
+    // - Large int32_t values can't be exactly represented as FP32. The conversion instruction in ARM NEON would
+    //   round it to nearest FP32 value with ties to even.
+    // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+    //   to nearest FP32 value with ties to even.
     const float32x4_t x_scaled = vmulq_f32(vcvtq_f32_s32(x), vscale);
     const float32x4_t y_scaled = vmulq_f32(vcvtq_f32_s32(y), vscale);
     const float32x4_t z_scaled = vmulq_f32(vcvtq_f32_s32(z), vscale);
     const float32x4_t w_scaled = vmulq_f32(vcvtq_f32_s32(w), vscale);
 
 #ifdef __aarch64__
-    /*
-     * Leverage "Floating-point Convert to Signed integer, rouding to nearest with ties to even" instruction.
-     * This is an ARMv8 instruction (always available in AArch64), which saturates result on overflow.
-     * We don't need to specifically consider saturated results, they will be clamped at the last stage.
-     */
+    // Leverage "Floating-point Convert to Signed integer, rounding to nearest with ties to even" instruction.
+    // This is an ARMv8 instruction (always available in AArch64), which saturates result on overflow.
+    // We don't need to specifically consider saturated results, they will be clamped at the last stage.
     const int32x4_t x_rounded = vcvtnq_s32_f32(x_scaled);
     const int32x4_t y_rounded = vcvtnq_s32_f32(y_scaled);
     const int32x4_t z_rounded = vcvtnq_s32_f32(z_scaled);
     const int32x4_t w_rounded = vcvtnq_s32_f32(w_scaled);
 
-    /*
-     * Standard final sequence on ARM NEON:
-     * - Pack to int16_t and saturate
-     * - Add zero point
-     * - Pack to uint8_t and saturate
-     * - Clamp between qmin and qmax
-     */
+    // Standard final sequence on ARM NEON:
+    // - Pack to int16_t and saturate
+    // - Add zero point
+    // - Pack to uint8_t and saturate
+    // - Clamp between qmin and qmax
     const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_rounded), y_rounded), vzero_point);
     const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_rounded), w_rounded), vzero_point);
     const uint8x16_t xyzw_packed = vqmovun_high_s16(vqmovun_s16(xy_packed), zw_packed);
@@ -86,64 +78,56 @@
     vst1q_u8(output, xyzw_clamped);
     output += 16;
 #else
-    /*
-     * ARMv7 NEON offers only a floating-point to integer conversion instruction with rounding towards zero.
-     * In lieu of conversion instruction with rounding-to-nearest-even, we use a magic trick of adding a large
-     * number (1.5 * 2**23) to scaled value to cause rounding to integer, and then substracing this magic number as
-     * integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
-     * generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
-     * works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
-     * zero point to the result, it gets into target [qmin, qmax] range.
-     */
+    // ARMv7 NEON offers only a floating-point to integer conversion instruction with rounding towards zero.
+    // In lieu of conversion instruction with rounding-to-nearest-even, we use a magic trick of adding a large
+    // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
+    // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+    // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+    // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+    // zero point to the result, it gets into target [qmin, qmax] range.
     const float32x4_t x_clamped = vminq_f32(vmaxq_f32(x_scaled, vfmin), vfmax);
     const float32x4_t y_clamped = vminq_f32(vmaxq_f32(y_scaled, vfmin), vfmax);
     const float32x4_t z_clamped = vminq_f32(vmaxq_f32(z_scaled, vfmin), vfmax);
     const float32x4_t w_clamped = vminq_f32(vmaxq_f32(w_scaled, vfmin), vfmax);
 
-    /*
-     * Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
-     * and result is rounded to nearest even integer with ties to even.
-     */
+    // Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
+    // and result is rounded to nearest integer with ties to even.
     const int32x4_t x_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(x_clamped, vfmagic)), vimagic);
     const int32x4_t y_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(y_clamped, vfmagic)), vimagic);
     const int32x4_t z_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(z_clamped, vfmagic)), vimagic);
     const int32x4_t w_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(w_clamped, vfmagic)), vimagic);
 
-    /*
-     * Select low 8 bits of each 32-bit integer in the vectors for the output.
-     * Since result is already clamped to [qmin, qmax] subrange of [0, 255], saturation is not needed.
-     */
+    // Select low 8 bits of each 32-bit integer in the vectors for the output.
+    // Since result is already clamped to [qmin, qmax] subrange of [0, 255], saturation is not needed.
     const int16x8_t xy_packed = vcombine_s16(vmovn_s32(x_biased), vmovn_s32(y_biased));
     const int16x8_t zw_packed = vcombine_s16(vmovn_s32(z_biased), vmovn_s32(w_biased));
     const uint8x16_t xyzw_packed = vreinterpretq_u8_s8(vcombine_s8(vmovn_s16(xy_packed), vmovn_s16(zw_packed)));
 
-    /*
-     * AArch32 version:
-     *   4x VCVT.F32.S32 Qd, Qm
-     *   4x VMUL.F32 Qd, Qm, Qn
-     *   4x VMIN.F32 Qd, Qm, Qn
-     *   4x VMAX.F32 Qd, Qm, Qn
-     *   4x VADD.F32 Qd, Qm, Qn
-     *   4x VSUB.S32 Qd, Qm, Qn
-     *   4x VMOVN.I32 Dd, Qm
-     *   2x VMOVN.I16 Dd, Qm
-     * ---------------------
-     * 30 instructions total
-     *
-     * AArch64 version:
-     *   4x SCVTF Vd.4S, Vn.4S
-     *   4x FMUL Vd.4S, Vn.4S, Vm.4S
-     *   4x FCVTNS Vd.4S, Vn.4S
-     *   2x SQXTN Vd.4H, Vn.4S
-     *   2x SQXTN2 Vd.8H, Vn.4S
-     *   2x ADD Vd.8H, Vn.8H, Vm.8H
-     *   1x SQXTUN Vd.8B, Vn.8H
-     *   1x SQXTUN2 Vd.16B, Vn.8H
-     *   1x UMIN Vd.16B, Vn.16B, Vm.16B
-     *   1x UMAX Vd.16B, Vn.16B, Vm.16B
-     * ---------------------
-     * 22 instructions total
-     */
+    // AArch32 version:
+    //   4x VCVT.F32.S32 Qd, Qm
+    //   4x VMUL.F32 Qd, Qm, Qn
+    //   4x VMIN.F32 Qd, Qm, Qn
+    //   4x VMAX.F32 Qd, Qm, Qn
+    //   4x VADD.F32 Qd, Qm, Qn
+    //   4x VSUB.S32 Qd, Qm, Qn
+    //   4x VMOVN.I32 Dd, Qm
+    //   2x VMOVN.I16 Dd, Qm
+    // ---------------------
+    // 30 instructions total
+    //
+    // AArch64 version:
+    //   4x SCVTF Vd.4S, Vn.4S
+    //   4x FMUL Vd.4S, Vn.4S, Vm.4S
+    //   4x FCVTNS Vd.4S, Vn.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x ADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTUN Vd.8B, Vn.8H
+    //   1x SQXTUN2 Vd.16B, Vn.8H
+    //   1x UMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x UMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 22 instructions total
 
     vst1q_u8(output, xyzw_packed);
     output += 16;
diff --git a/src/requantization/fp32-psimd.c b/src/requantization/fp32-psimd.c
index 922a7e2..4038b27 100644
--- a/src/requantization/fp32-psimd.c
+++ b/src/requantization/fp32-psimd.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -41,47 +39,39 @@
     const psimd_s32 w = psimd_load_s32(input + 12);
     input += 16;
 
-    /*
-     * Convert int32_t input to FP32 and multiply by FP32 scale.
-     * Both operations involve roundings:
-     * - Large int32_t values can't be exactly represented as FP32. We expect that conversion instruction would
-     *   round it to nearest FP32 value with ties to even, but Clang documentation for __builtin_convertvector does
-     *   not guaratee that.
-     * - Product of two FP32 values is generally not exactly representation as an FP32 value, and will be rounded
-     *   to nearest FP32 value with ties to even.
-     */
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve roundings:
+    // - Large int32_t values can't be exactly represented as FP32. We expect that conversion instruction would
+    //   round it to nearest FP32 value with ties to even, but Clang documentation for __builtin_convertvector does
+    //   not guarantee that.
+    // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+    //   to nearest FP32 value with ties to even.
     const psimd_f32 x_scaled = psimd_cvt_s32_f32(x) * vscale;
     const psimd_f32 y_scaled = psimd_cvt_s32_f32(y) * vscale;
     const psimd_f32 z_scaled = psimd_cvt_s32_f32(z) * vscale;
     const psimd_f32 w_scaled = psimd_cvt_s32_f32(w) * vscale;
 
-    /*
-     * Clang/gcc vector extension does not provide an intrinsics for a floating-point to integer conversion
-     * operation with rounding-to-nearest-even. In lieu of such intrinsic, we use a magic trick of adding a large
+    // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
-     * integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
-     * generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
-     * works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
-     * zero point to the result, it gets into target [qmin, qmax] range.
-     */
+    // Clang/gcc vector extension does not provide an intrinsic for a floating-point to integer conversion
+    // operation with rounding-to-nearest-even. In lieu of such intrinsic, we use a magic trick of adding a large
+    // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then substracing this magic number as
+    // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+    // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+    // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+    // zero point to the result, it gets into target [qmin, qmax] range.
     const psimd_f32 x_clamped = psimd_min_f32(psimd_max_f32(x_scaled, vfmin), vfmax);
     const psimd_f32 y_clamped = psimd_min_f32(psimd_max_f32(y_scaled, vfmin), vfmax);
     const psimd_f32 z_clamped = psimd_min_f32(psimd_max_f32(z_scaled, vfmin), vfmax);
     const psimd_f32 w_clamped = psimd_min_f32(psimd_max_f32(w_scaled, vfmin), vfmax);
 
-    /*
-     * Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
-     * and result is rounded to nearest even integer with ties to even.
-     */
+    // Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
+    // and result is rounded to nearest integer with ties to even.
     const psimd_s32 x_biased = (psimd_s32)(x_clamped + vfmagic) - vimagic;
     const psimd_s32 y_biased = (psimd_s32)(y_clamped + vfmagic) - vimagic;
     const psimd_s32 z_biased = (psimd_s32)(z_clamped + vfmagic) - vimagic;
     const psimd_s32 w_biased = (psimd_s32)(w_clamped + vfmagic) - vimagic;
 
-    /*
-     * Select low 8 bits of each 32-bit integer in the vectors for the output.
-     * Since result is already clamped to [qmin, qmax] subrange of [0, 255], saturation is not needed.
-     */
+    // Select low 8 bits of each 32-bit integer in the vectors for the output.
+    // Since result is already clamped to [qmin, qmax] subrange of [0, 255], saturation is not needed.
     const psimd_u16 xy_packed = psimd_concat_even_u16((psimd_u16) x_biased, (psimd_u16) y_biased);
     const psimd_u16 zw_packed = psimd_concat_even_u16((psimd_u16) z_biased, (psimd_u16) w_biased);
 
diff --git a/src/requantization/fp32-scalar.c b/src/requantization/fp32-scalar.c
index 52fa1c9..53e8c75 100644
--- a/src/requantization/fp32-scalar.c
+++ b/src/requantization/fp32-scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <math.h>
diff --git a/src/requantization/fp32-sse2.c b/src/requantization/fp32-sse2.c
index b574a12..163acc5 100644
--- a/src/requantization/fp32-sse2.c
+++ b/src/requantization/fp32-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -40,61 +38,53 @@
     const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
     input += 16;
 
-    /*
-     * Convert int32_t input to FP32 and multiply by FP32 scale.
-     * Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
-     * - Large int32_t values can't be exactly represented as FP32. CVTDQ2PS instruction on x86 would round it
-     *   according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
-     * - Product of two FP32 values is generally not exactly representation as an FP32 value, and will be rounded
-     *   to nearest FP32 value with ties to even with default MXCSR rounding mode.
-     */
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
+    // - Large int32_t values can't be exactly represented as FP32. CVTDQ2PS instruction on x86 would round it
+    //   according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
+    // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+    //   to nearest FP32 value with ties to even with default MXCSR rounding mode.
     const __m128 x_scaled = _mm_mul_ps(_mm_cvtepi32_ps(x), vscale);
     const __m128 y_scaled = _mm_mul_ps(_mm_cvtepi32_ps(y), vscale);
     const __m128 z_scaled = _mm_mul_ps(_mm_cvtepi32_ps(z), vscale);
     const __m128 w_scaled = _mm_mul_ps(_mm_cvtepi32_ps(w), vscale);
 
-    /*
-     * Convert scaled FP32 result to int32_t using CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ instruction rounds
-     * result according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
-     * However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
-     * of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
-     * have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
-     * which overflows int32_t when it is converted back to integer.
-     *
-     * Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
-     * input is INT32_MAX (2**31 - 1), which turns into 2**31 when converted to float. The largest scale value
-     * is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
-     * which fits into int32_t without overflow.
-     */
+    // Convert scaled FP32 result to int32_t using CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ instruction rounds
+    // result according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
+    // However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
+    // of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
+    // have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
+    // which overflows int32_t when it is converted back to integer.
+    //
+    // Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
+    // input is INT32_MAX (2**31 - 1), which turns into 2**31 when converted to float. The largest scale value
+    // is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
+    // which fits into int32_t without overflow.
     const __m128i x_rounded = _mm_cvtps_epi32(x_scaled);
     const __m128i y_rounded = _mm_cvtps_epi32(y_scaled);
     const __m128i z_rounded = _mm_cvtps_epi32(z_scaled);
     const __m128i w_rounded = _mm_cvtps_epi32(w_scaled);
 
-    /*
-     * Standard final sequence on x86 SSE2:
-     * - Pack to int16_t and saturate
-     * - Add zero point
-     * - Pack to uint8_t and saturate
-     * - Clamp between qmin and qmax
-     */
+    // Standard final sequence on x86 SSE2:
+    // - Pack to int16_t and saturate
+    // - Add zero point
+    // - Pack to uint8_t and saturate
+    // - Clamp between qmin and qmax
     const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_rounded, y_rounded), vzero_point);
     const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_rounded, w_rounded), vzero_point);
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 4x CVTDQ2PS
-     * 4x MULPS
-     * 4x CVTPS2DQ
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 2x PADDW
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 19 instructions total
-     */
+    // 4x CVTDQ2PS
+    // 4x MULPS
+    // 4x CVTPS2DQ
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 2x PADDW
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 19 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/gemmlowp-neon.c b/src/requantization/gemmlowp-neon.c
index 37b6718..7aa480a 100644
--- a/src/requantization/gemmlowp-neon.c
+++ b/src/requantization/gemmlowp-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -17,12 +15,10 @@
 #include <xnnpack/requantization-stubs.h>
 
 
-/*
- * The requantization implementation below is adapted from Google's gemmlowp
- * library. It is only used in XNNPACK unit tests and comparative benchmarks,
- * but not the library itself.
- */
-
+// The requantization implementation below is adapted from Google's gemmlowp
+// library. It is only used in XNNPACK unit tests and comparative benchmarks,
+// but not the library itself.
+//
 // Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -52,7 +48,7 @@
 
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t multiplier = ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7;
   const int32_t exponent = (fp32_to_bits(scale) >> 23) - 127 - 23 - 7;
   const int32_t shift = -(32 /* using high 32 bits in VQRDMUL */ - 1 /* doubling in VQRDMUL */ + exponent);
diff --git a/src/requantization/gemmlowp-scalar.c b/src/requantization/gemmlowp-scalar.c
index 78f61b1..38eead8 100644
--- a/src/requantization/gemmlowp-scalar.c
+++ b/src/requantization/gemmlowp-scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -33,13 +31,13 @@
 
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t multiplier = ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7;
   const int32_t exponent = (fp32_to_bits(scale) >> 23) - 127 - 23 - 7;
   const int32_t shift = -(32 /* using high 32 bits in VQRDMUL */ - 1 /* doubling in VQRDMUL */ + exponent);
 
-  const int32_t smin = (int32_t)(uint32_t) qmin;
-  const int32_t smax = (int32_t)(uint32_t) qmax;
+  const int32_t smin = (int32_t) (uint32_t) qmin;
+  const int32_t smax = (int32_t) (uint32_t) qmax;
   for (; n != 0; n -= 4) {
     const int32_t x = input[0];
     const int32_t y = input[1];
@@ -57,13 +55,13 @@
     const int32_t z_scaled = gemmlowp_scalar_rdivbypo2_s32(z_product, shift);
     const int32_t w_scaled = gemmlowp_scalar_rdivbypo2_s32(w_product, shift);
 
-    /* Add zero point to scaled value */
+    // Add zero point to scaled value.
     const int32_t x_biased = x_scaled + zero_point;
     const int32_t y_biased = y_scaled + zero_point;
     const int32_t z_biased = z_scaled + zero_point;
     const int32_t w_biased = w_scaled + zero_point;
 
-    /* Clamp scaled value with zero point between smin and smax */
+    // Clamp scaled value with zero point between smin and smax.
     const int32_t x_clamped = x_biased < smin ? smin : x_biased > smax ? smax : x_biased;
     const int32_t y_clamped = y_biased < smin ? smin : y_biased > smax ? smax : y_biased;
     const int32_t z_clamped = z_biased < smin ? smin : z_biased > smax ? smax : z_biased;
diff --git a/src/requantization/gemmlowp-scalar.h b/src/requantization/gemmlowp-scalar.h
index 91f8bf4..d4d85c9 100644
--- a/src/requantization/gemmlowp-scalar.h
+++ b/src/requantization/gemmlowp-scalar.h
@@ -1,24 +1,19 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #pragma once
 
 #include <stdint.h>
 #include <limits.h>
 
-/*
- * The code below is adapted from Google's gemmlowp library.
- * It is only used in XNNPACK unit tests and comparative benchmarks,
- * but not the library itself.
- */
-
+// The code below is adapted from Google's gemmlowp library.
+// It is only used in XNNPACK unit tests and comparative benchmarks, but not the library itself.
+//
 // Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/requantization/gemmlowp-sse.h b/src/requantization/gemmlowp-sse.h
index 1335fe1..d8e2cda 100644
--- a/src/requantization/gemmlowp-sse.h
+++ b/src/requantization/gemmlowp-sse.h
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #pragma once
 
@@ -14,12 +12,10 @@
 
 #include <immintrin.h>
 
-/*
- * The code below is adapted from Google's gemmlowp library.
- * It is only used in XNNPACK unit tests and comparative benchmarks,
- * but not the library itself.
- */
-
+// The code below is adapted from Google's gemmlowp library.
+// It is only used in XNNPACK unit tests and comparative benchmarks,
+// but not the library itself.
+//
 // Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,12 +55,12 @@
 #ifdef __SSSE3__
   a_neg = _mm_abs_epi32(a); // negate a and b
   b_neg = _mm_abs_epi32(b); // negate a and b
-#else /* pre-SSSE3 */
+#else  // pre-SSSE3
   const __m128i a_neg_mask = _mm_cmplt_epi32(a, zero);
   a_neg = _mm_sub_epi32(_mm_xor_si128(a, a_neg_mask), a_neg_mask);
   const __m128i b_neg_mask = _mm_cmplt_epi32(b, zero);
   b_neg = _mm_sub_epi32(_mm_xor_si128(b, b_neg_mask), b_neg_mask);
-#endif /* pre-SSSE3 */
+#endif  // pre-SSSE3
   mul_us = _mm_mul_epu32(a_neg, b_neg); // uses 0 and 2nd data lanes, (abs), the
                                         // multiplication gives 64 bit result
   mul_us_neg = _mm_sub_epi64(zero, mul_us);
diff --git a/src/requantization/gemmlowp-sse2.c b/src/requantization/gemmlowp-sse2.c
index 93a869e..efac6d0 100644
--- a/src/requantization/gemmlowp-sse2.c
+++ b/src/requantization/gemmlowp-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -34,7 +32,7 @@
 
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t multiplier = ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7;
   const int32_t exponent = (fp32_to_bits(scale) >> 23) - 127 - 23 - 7;
   const int32_t shift = -(32 /* using high 32 bits in VQRDMUL */ - 1 /* doubling in VQRDMUL */ + exponent);
diff --git a/src/requantization/gemmlowp-sse4.c b/src/requantization/gemmlowp-sse4.c
index a315746..575f589 100644
--- a/src/requantization/gemmlowp-sse4.c
+++ b/src/requantization/gemmlowp-sse4.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -34,7 +32,7 @@
 
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t multiplier = ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7;
   const int32_t exponent = (fp32_to_bits(scale) >> 23) - 127 - 23 - 7;
   const int32_t shift = -(32 /* using high 32 bits in VQRDMUL */ - 1 /* doubling in VQRDMUL */ + exponent);
diff --git a/src/requantization/gemmlowp-ssse3.c b/src/requantization/gemmlowp-ssse3.c
index 700e000..4637770 100644
--- a/src/requantization/gemmlowp-ssse3.c
+++ b/src/requantization/gemmlowp-ssse3.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -34,7 +32,7 @@
 
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t multiplier = ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7;
   const int32_t exponent = (fp32_to_bits(scale) >> 23) - 127 - 23 - 7;
   const int32_t shift = -(32 /* using high 32 bits in VQRDMUL */ - 1 /* doubling in VQRDMUL */ + exponent);
diff --git a/src/requantization/precise-neon.c b/src/requantization/precise-neon.c
index 2e796e3..bcbb11b 100644
--- a/src/requantization/precise-neon.c
+++ b/src/requantization/precise-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -128,39 +126,37 @@
 
     const uint8x16_t xyzw_clamped = vmaxq_u8(vminq_u8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * AArch32 version:
-     *   4x VCLT.S32 Qd, Qm, #0
-     *   8x VMULL.S32 Qd, Dm, Dn
-     *   8x VADDW.S32 Qd, Qm, Dn
-     *   8x VRSHL.S32 Qd, Qm, Qn
-     *   8x VMOVN.S64 Dd, Qm
-     *   4x VQMOVN.S32 Dd, Qm
-     *   2x VADD.S16 Qd, Qm, Qn
-     *   2x VQMOVUN.S16 Dd, Qm
-     *   1x VMAX.U8 Qd, Qm, Qn
-     *   1x VMIN.U8 Qd, Qm, Qn
-     * ---------------------
-     * 46 instructions total
-     *
-     * AArch64 version:
-     *   4x CMLT Vd.4S, Vn.4S, #0
-     *   4x SMULL Vd.2D, Vn.2S, Vm.2S
-     *   4x SMULL2 Vd.2D, Vn.4S, Vm.4S
-     *   4x SADDW Vd.2D, Vn.2D, Vm.2S
-     *   4x SADDW2 Vd.2D, Vn.2D, Vm.4S
-     *   8x SRSHL Vd.2D, Vn.2D, Vm.2D
-     *   4x UZP1 Vd.4S, Vn.4S, Vm.4S
-     *   2x SQXTN Vd.4H, Vn.4S
-     *   2x SQXTN2 Vd.8H, Vn.4S
-     *   2x ADD Vd.8H, Vn.8H, Vm.8H
-     *   1x SQXTUN Vd.8B, Vn.8H
-     *   1x SQXTUN2 Vd.16B, Vn.8H
-     *   1x UMIN Vd.16B, Vn.16B, Vm.16B
-     *   1x UMAX Vd.16B, Vn.16B, Vm.16B
-     * ---------------------
-     * 42 instructions total
-     */
+    // AArch32 version:
+    //   4x VCLT.S32 Qd, Qm, #0
+    //   8x VMULL.S32 Qd, Dm, Dn
+    //   8x VADDW.S32 Qd, Qm, Dn
+    //   8x VRSHL.S32 Qd, Qm, Qn
+    //   8x VMOVN.S64 Dd, Qm
+    //   4x VQMOVN.S32 Dd, Qm
+    //   2x VADD.S16 Qd, Qm, Qn
+    //   2x VQMOVUN.S16 Dd, Qm
+    //   1x VMAX.U8 Qd, Qm, Qn
+    //   1x VMIN.U8 Qd, Qm, Qn
+    // ---------------------
+    // 46 instructions total
+    //
+    // AArch64 version:
+    //   4x CMLT Vd.4S, Vn.4S, #0
+    //   4x SMULL Vd.2D, Vn.2S, Vm.2S
+    //   4x SMULL2 Vd.2D, Vn.4S, Vm.4S
+    //   4x SADDW Vd.2D, Vn.2D, Vm.2S
+    //   4x SADDW2 Vd.2D, Vn.2D, Vm.4S
+    //   8x SRSHL Vd.2D, Vn.2D, Vm.2D
+    //   4x UZP1 Vd.4S, Vn.4S, Vm.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x ADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTUN Vd.8B, Vn.8H
+    //   1x SQXTUN2 Vd.16B, Vn.8H
+    //   1x UMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x UMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 42 instructions total
 
     vst1q_u8(output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/precise-psimd.c b/src/requantization/precise-psimd.c
index 5228155..b065d64 100644
--- a/src/requantization/precise-psimd.c
+++ b/src/requantization/precise-psimd.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
diff --git a/src/requantization/precise-scalar.c b/src/requantization/precise-scalar.c
index e93ae0e..6ec3bf8 100644
--- a/src/requantization/precise-scalar.c
+++ b/src/requantization/precise-scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -48,46 +46,42 @@
     const int32_t w = input[3];
     input += 4;
 
-    /*
-     * Compute absolute value of input as unsigned 32-bit int.
-     * All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
-     */
+    // Compute absolute value of input as unsigned 32-bit int.
+    // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
     const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
     const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y;
     const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z;
     const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w;
 
-    /* Compute full 64-bit product of 32-bit factors */
+    // Compute full 64-bit product of 32-bit factors.
     const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier;
     const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier;
     const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier;
     const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier;
 
-    /*
-     * Shift the full 64-bit product right with rounding.
-     * Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
-     *
-     * Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace
-     * 64-bit operations with 32-bit operations.
-     *
-     * To avoid full 64-bit addition we make use of three facts:
-     * - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set.
-     * - When 0x1.0p-32f <= scale < 0x1.0p-31f, then the non-zero bit in rounding is in the low 32 bits, and
-     *   rounding is exactly 0x80000000 (2**31), because rounding is 2**(scale-1) and scale >= 32. In this case,
-     *   addition of rounding can affect high 32 bits of the product only through overflow, which happens if
-     *   low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition
-     *   as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part
-     *   of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the
-     *   bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as
-     *      (int32_t) (LOW(product) & LOW(rounding)) < 0
-     * - When 0x1.0p-31f <= scale < 1.0f, then the non-zero bit is in the high 32 bits of rounding. We just need
-     *   to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never
-     *   overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(scale-1) <= 2**62.
-     *
-     * To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps:
-     * - Shift by 32, which can be implemented by extacting the high 32-bit word on 32-bit systems.
-     * - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result.
-     */
+    // Shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+    //
+    // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace
+    // 64-bit operations with 32-bit operations.
+    //
+    // To avoid full 64-bit addition we make use of three facts:
+    // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set.
+    // - When 0x1.0p-32f <= scale < 0x1.0p-31f, then the non-zero bit in rounding is in the low 32 bits, and
+    //   rounding is exactly 0x80000000 (2**31), because rounding is 2**(shift-1) and shift >= 32. In this case,
+    //   addition of rounding can affect high 32 bits of the product only through overflow, which happens if
+    //   low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition
+    //   as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part
+    //   of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the
+    //   bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as
+    //      (int32_t) (LOW(product) & LOW(rounding)) < 0
+    // - When 0x1.0p-31f <= scale < 1.0f, then the non-zero bit is in the high 32 bits of rounding. We just need
+    //   to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never
+    //   overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(shift-1) <= 2**62.
+    //
+    // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps:
+    // - Shift by 32, which can be implemented by extracting the high 32-bit word on 32-bit systems.
+    // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result.
     const uint32_t x_carry_lo = (uint32_t)((int32_t)((uint32_t) x_product & rounding_lo) < 0);
     const uint32_t y_carry_lo = (uint32_t)((int32_t)((uint32_t) y_product & rounding_lo) < 0);
     const uint32_t z_carry_lo = (uint32_t)((int32_t)((uint32_t) z_product & rounding_lo) < 0);
@@ -103,27 +97,23 @@
     const uint32_t z_abs_scaled = (uint32_t)(z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32;
     const uint32_t w_abs_scaled = (uint32_t)(w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32;
 
-    /* Copy the sign of input to scaled absolute input value */
+    // Copy the sign of input to scaled absolute input value.
     const int32_t x_scaled = (int32_t)(x >= 0 ? x_abs_scaled : -x_abs_scaled);
     const int32_t y_scaled = (int32_t)(y >= 0 ? y_abs_scaled : -y_abs_scaled);
     const int32_t z_scaled = (int32_t)(z >= 0 ? z_abs_scaled : -z_abs_scaled);
     const int32_t w_scaled = (int32_t)(w >= 0 ? w_abs_scaled : -w_abs_scaled);
 
-    /*
-     * Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-     */
+    // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
     const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
     const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
     const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
 
-    /*
-     * Add zero point to clamped value.
-     * The result is guaranteed to be in [qmin, qmax] range.
-     *
-     * This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-     * range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
-     */
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
@@ -166,59 +156,49 @@
     const int32_t w = input[3];
     input += 4;
 
-    /*
-     * Compute absolute value of input as unsigned 32-bit int.
-     * All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
-     */
+    // Compute absolute value of input as unsigned 32-bit int.
+    // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
     const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
     const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y;
     const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z;
     const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w;
 
-    /* Compute full 64-bit product of 32-bit factors */
+    // Compute full 64-bit product of 32-bit factors.
     const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier;
     const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier;
     const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier;
     const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier;
 
-    /*
-     * Shift the full 64-bit product right with rounding.
-     * Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
-     *
-     * Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
-     * "right shift with rounding" instruction each line below can be represented by just one such instruction
-     * (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD).
-     */
+    // Shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+    //
+    // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
+    // "right shift with rounding" instruction each line below can be represented by just one such instruction
+    // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD).
     const uint32_t x_abs_scaled = (uint32_t)((x_product + rounding) >> shift);
     const uint32_t y_abs_scaled = (uint32_t)((y_product + rounding) >> shift);
     const uint32_t z_abs_scaled = (uint32_t)((z_product + rounding) >> shift);
     const uint32_t w_abs_scaled = (uint32_t)((w_product + rounding) >> shift);
 
-    /*
-     * Copy the sign of input to scaled absolute input value.
-     *
-     * On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction.
-     */
+    // Copy the sign of input to scaled absolute input value.
+    //
+    // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction.
     const int32_t x_scaled = (int32_t)(x >= 0 ? x_abs_scaled : -x_abs_scaled);
     const int32_t y_scaled = (int32_t)(y >= 0 ? y_abs_scaled : -y_abs_scaled);
     const int32_t z_scaled = (int32_t)(z >= 0 ? z_abs_scaled : -z_abs_scaled);
     const int32_t w_scaled = (int32_t)(w >= 0 ? w_abs_scaled : -w_abs_scaled);
 
-    /*
-     * Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-     */
+    // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
     const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
     const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
     const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
 
-    /*
-     * Add zero point to clamped value.
-     * The result is guaranteed to be in [qmin, qmax] range.
-     *
-     * This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-     * range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
-     */
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
@@ -261,52 +241,42 @@
     const int32_t w = input[3];
     input += 4;
 
-    /*
-     * Compute full 64-bit product of signed 32-bit factors.
-     *
-     * Note: multiplier can be treated as either signed or unsigned.
-     */
+    // Compute full 64-bit product of signed 32-bit factors.
+    //
+    // Note: multiplier can be treated as either signed or unsigned.
     const int64_t x_product = (int64_t) x * (int64_t) multiplier;
     const int64_t y_product = (int64_t) y * (int64_t) multiplier;
     const int64_t z_product = (int64_t) z * (int64_t) multiplier;
     const int64_t w_product = (int64_t) w * (int64_t) multiplier;
 
-    /*
-     * Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero.
-     */
+    // Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero.
     const int64_t x_adjusted_product = x_product - (int64_t)(x < 0);
     const int64_t y_adjusted_product = y_product - (int64_t)(y < 0);
     const int64_t z_adjusted_product = z_product - (int64_t)(z < 0);
     const int64_t w_adjusted_product = w_product - (int64_t)(w < 0);
 
-    /*
-     * Arithmetically shift the full 64-bit product right with rounding.
-     * Rounding is performed towards closest integer, with midpoints rounded up.
-     *
-     * Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
-     * "right shift with rounding" instruction each line below can be represented by just one such instruction
-     * (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
-     */
+    // Arithmetically shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up.
+    //
+    // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
+    // "right shift with rounding" instruction each line below can be represented by just one such instruction
+    // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
     const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift);
     const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift);
     const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift);
     const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift);
 
-    /*
-     * Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-     */
+    // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
     const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
     const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
     const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
 
-    /*
-     * Add zero point to clamped value.
-     * The result is guaranteed to be in [qmin, qmax] range.
-     *
-     * This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-     * range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
-     */
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
diff --git a/src/requantization/precise-sse2.c b/src/requantization/precise-sse2.c
index c82361c..c733ee3 100644
--- a/src/requantization/precise-sse2.c
+++ b/src/requantization/precise-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -108,23 +106,21 @@
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 4x PXOR (setzero)
-     * 8x PSUBD
-     * 8x PXOR
-     * 8x PSHUFD
-     * 8x PMULUDQ
-     * 8x PSRLQ
-     * 8x PADDQ
-     * 4x SHUFPS
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 2x PADDW
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 63 instructions total
-     */
+    // 4x PXOR (setzero)
+    // 8x PSUBD
+    // 8x PXOR
+    // 8x PSHUFD
+    // 8x PMULUDQ
+    // 8x PSRLQ
+    // 8x PADDQ
+    // 4x SHUFPS
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 2x PADDW
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 63 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/precise-sse4.c b/src/requantization/precise-sse4.c
index 974a3b1..ef85338 100644
--- a/src/requantization/precise-sse4.c
+++ b/src/requantization/precise-sse4.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -95,23 +93,21 @@
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 4x PABSD
-     * 4x PSHUFD
-     * 8x PMULUDQ
-     * 4x PSRLQ
-     * 4x PSRLD
-     * 8x PADDQ
-     * 4x PBLENDW
-     * 4x PSIGND
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 2x PADDW
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 47 instructions total
-     */
+    // 4x PABSD
+    // 4x PSHUFD
+    // 8x PMULUDQ
+    // 4x PSRLQ
+    // 4x PSRLD
+    // 8x PADDQ
+    // 4x PBLENDW
+    // 4x PSIGND
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 2x PADDW
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 47 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/precise-ssse3.c b/src/requantization/precise-ssse3.c
index 626c0eb..b673d00 100644
--- a/src/requantization/precise-ssse3.c
+++ b/src/requantization/precise-ssse3.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -103,22 +101,20 @@
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 4x PABSD
-     * 8x PSHUFD
-     * 8x PMULUDQ
-     * 8x PSRLQ
-     * 8x PADDQ
-     * 4x SHUFPS
-     * 4x PSIGND
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 2x PADDW
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 51 instructions total
-     */
+    // 4x PABSD
+    // 8x PSHUFD
+    // 8x PMULUDQ
+    // 8x PSRLQ
+    // 8x PADDQ
+    // 4x SHUFPS
+    // 4x PSIGND
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 2x PADDW
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 51 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/q31-neon.c b/src/requantization/q31-neon.c
index 37986bc..42ed833 100644
--- a/src/requantization/q31-neon.c
+++ b/src/requantization/q31-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -30,15 +28,15 @@
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -56,23 +54,19 @@
     const int32x4_t w = vld1q_s32(input + 12);
     input += 16;
 
-    /*
-     * Directly use VQRDMULH/SQRDMULH instruction for Q31 multiplication with rounding.
-     * Although these instruction saturate out-of-range outputs, we never hit this case in requantization.
-     */
+    // Directly use VQRDMULH/SQRDMULH instruction for Q31 multiplication with rounding.
+    // Although these instructions saturate out-of-range outputs, we never hit this case in requantization.
     const int32x4_t x_product = vqrdmulhq_s32(x, vmultiplier);
     const int32x4_t y_product = vqrdmulhq_s32(y, vmultiplier);
     const int32x4_t z_product = vqrdmulhq_s32(z, vmultiplier);
     const int32x4_t w_product = vqrdmulhq_s32(w, vmultiplier);
 
-    /*
-     * Shift the 32-bit product right with rounding.
-     * Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
-     *
-     * We leverage the "right shift with rounding" instruction (VRSHL.S32 on ARM NEON, SRSHL in ARM64 Advanced SIMD) to
-     * do the shift. However, as this instruction rounds midpoints up, rather than away from zero, we adjust the input
-     * by subtracting 1 from negative values, but only if shift is non-zero.
-     */
+    // Shift the 32-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+    //
+    // We leverage the "right shift with rounding" instruction (VRSHL.S32 on ARM NEON, SRSHL in ARM64 Advanced SIMD) to
+    // do the shift. However, as this instruction rounds midpoints up, rather than away from zero, we adjust the input
+    // by subtracting 1 from negative values, but only if shift is non-zero.
     const int32x4_t x_adjusted_product = vsraq_n_s32(x_product, vbicq_s32(x, vshift_eq_0_mask), 31);
     const int32x4_t y_adjusted_product = vsraq_n_s32(y_product, vbicq_s32(y, vshift_eq_0_mask), 31);
     const int32x4_t z_adjusted_product = vsraq_n_s32(z_product, vbicq_s32(z, vshift_eq_0_mask), 31);
@@ -95,35 +89,33 @@
 
     const uint8x16_t xyzw_clamped = vmaxq_u8(vminq_u8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * AArch32 version:
-     *   4x VQRDMULH.S32 Qd, Qm, Qn
-     *   4x VAND Qd, Qm, Dn
-     *   4x VSRA.S32 Qd, Qm, #31
-     *   4x VRSHL.S32 Qd, Qm, Qn
-     *   4x VQMOVN.S32 Dd, Qm
-     *   2x VADD.S16 Qd, Qm, Qn
-     *   2x VQMOVUN.S16 Dd, Qm
-     *   1x VMAX.U8 Qd, Qm, Qn
-     *   1x VMIN.U8 Qd, Qm, Qn
-     * ---------------------
-     * 26 instructions total
-     *
-     * AArch64 version:
-     *   4x SQRDMULH Vd.4S, Vn.4S, Vm.4S
-     *   4x AND Vd.16B, Vn.16B, Vm.16B
-     *   4x SSRA Vd.4S, Vn.4S, #31
-     *   4x SRSHL Vd.4S, Vn.4S, Vm.4S
-     *   2x SQXTN Vd.4H, Vn.4S
-     *   2x SQXTN2 Vd.8H, Vn.4S
-     *   2x ADD Vd.8H, Vn.8H, Vm.8H
-     *   1x SQXTUN Vd.8B, Vn.8H
-     *   1x SQXTUN2 Vd.16B, Vn.8H
-     *   1x UMIN Vd.16B, Vn.16B, Vm.16B
-     *   1x UMAX Vd.16B, Vn.16B, Vm.16B
-     * ---------------------
-     * 26 instructions total
-     */
+    // AArch32 version:
+    //   4x VQRDMULH.S32 Qd, Qm, Qn
+    //   4x VAND Qd, Qm, Dn
+    //   4x VSRA.S32 Qd, Qm, #31
+    //   4x VRSHL.S32 Qd, Qm, Qn
+    //   4x VQMOVN.S32 Dd, Qm
+    //   2x VADD.S16 Qd, Qm, Qn
+    //   2x VQMOVUN.S16 Dd, Qm
+    //   1x VMAX.U8 Qd, Qm, Qn
+    //   1x VMIN.U8 Qd, Qm, Qn
+    // ---------------------
+    // 26 instructions total
+    //
+    // AArch64 version:
+    //   4x SQRDMULH Vd.4S, Vn.4S, Vm.4S
+    //   4x AND Vd.16B, Vn.16B, Vm.16B
+    //   4x SSRA Vd.4S, Vn.4S, #31
+    //   4x SRSHL Vd.4S, Vn.4S, Vm.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x ADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTUN Vd.8B, Vn.8H
+    //   1x SQXTUN2 Vd.16B, Vn.8H
+    //   1x UMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x UMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 26 instructions total
 
     vst1q_u8(output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/q31-scalar.c b/src/requantization/q31-scalar.c
index 1677d0b..b12d48a 100644
--- a/src/requantization/q31-scalar.c
+++ b/src/requantization/q31-scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -29,15 +27,15 @@
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -54,53 +52,47 @@
     const int32_t w = input[3];
     input += 4;
 
-    /*
-     * Compute full 64-bit product of signed 32-bit factors.
-     *
-     * Note: multiplier can be treated as either signed or unsigned.
-     */
+    // Compute full 64-bit product of signed 32-bit factors.
+    //
+    // Note: multiplier can be treated as either signed or unsigned.
     const int64_t x_product = (int64_t) x * (int64_t) multiplier;
     const int64_t y_product = (int64_t) y * (int64_t) multiplier;
     const int64_t z_product = (int64_t) z * (int64_t) multiplier;
     const int64_t w_product = (int64_t) w * (int64_t) multiplier;
 
-    /*
-     * Get the Q31 multiplication result by extracting bits 31-62 of the product, with rounding up.
-     * Add rounding value (0x40000000) and then shift right by 31 bits and extract the low 32-bit word.
-     * Note: casts to unsigned types are needed to avoid undefined behavior.
-     * Given the multiplier range, the result of Q31 multiplication is in [-2147483520, 2147483519] range.
-     */
+    // Get the Q31 multiplication result by extracting bits 31-62 of the product, with rounding up.
+    // Add rounding value (0x40000000) and then shift right by 31 bits and extract the low 32-bit word.
+    // Note: casts to unsigned types are needed to avoid undefined behavior.
+    // Given the multiplier range, the result of Q31 multiplication is in [-2147483520, 2147483519] range.
     const int32_t x_q31product = (int32_t)(uint32_t)((uint64_t)(x_product + q31rounding) >> 31);
     const int32_t y_q31product = (int32_t)(uint32_t)((uint64_t)(y_product + q31rounding) >> 31);
     const int32_t z_q31product = (int32_t)(uint32_t)((uint64_t)(z_product + q31rounding) >> 31);
     const int32_t w_q31product = (int32_t)(uint32_t)((uint64_t)(w_product + q31rounding) >> 31);
 
-    /*
-     * Arithmetically shift the adjusted product right with rounding.
-     * Rounding is performed towards closest integer, with midpoints rounded away from zero.
-     *
-     * Shift with correct rounding could be efficiently implemented by pre-adding rounding constant, but with input in
-     * [-2147483520, 2147483519] range and rounding constant up to 2**30 we can't rule out overflow. This limitation
-     * leaves us with 3 options:
-     * 1. Extend input to 64-bit signed integer, perform addition and shift on 64-bit integers, then truncate result
-     *    to 32 bits.
-     * 2. Detect overflow and handle this situation separately. Note that overflow is possible only when input is
-     *    positive, and even when addition of a rounding constant overflows 32-bit signed integer, it still doesn't
-     *    overflow 32-bit unsigned integer. Thus, in case of signed overflow, we can compute the result using unsigned
-     *    arithmetics, specifically using logical shift right instead of arithmetic shift right.
-     * 3. Performs arithmetic shift as is, which will produce division result rounded down. Then compute remainder of
-     *    this division by a power of 2, and adjust the result. Result needs adjustment (increment by 1) when
-     *     - input is positive, shift is non-zero, and remainder >= 2**(shift - 1), e.g. 10 >> 2 needs adjustment
-     *     - input is negative, shift is non-zero, and remainder > 2**(shift - 1), e.g. -10 >> 2 doesn't need adjustment
-     *    These conditions can be generalized as
-     *        remainder + (input <= 0) > 2**(shift - 1)
-     *    or equivalently
-     *        remainder - (input < 0) > ((2**shift - 1) >> 1)
-     *    When shift is 0, remainder is 0 as well, the last condition is always false, and no adjustment is done.
-     *
-     * Among these options, option 3 is the most performant across the board, although option 1 is promising for 64-bit
-     * instruction sets.
-     */
+    // Arithmetically shift the adjusted product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded away from zero.
+    //
+    // Shift with correct rounding could be efficiently implemented by pre-adding rounding constant, but with input in
+    // [-2147483520, 2147483519] range and rounding constant up to 2**30 we can't rule out overflow. This limitation
+    // leaves us with 3 options:
+    // 1. Extend input to 64-bit signed integer, perform addition and shift on 64-bit integers, then truncate result
+    //    to 32 bits.
+    // 2. Detect overflow and handle this situation separately. Note that overflow is possible only when input is
+    //    positive, and even when addition of a rounding constant overflows 32-bit signed integer, it still doesn't
+    //    overflow 32-bit unsigned integer. Thus, in case of signed overflow, we can compute the result using unsigned
+    //    arithmetic, specifically using logical shift right instead of arithmetic shift right.
+    // 3. Perform arithmetic shift as is, which will produce division result rounded down. Then compute remainder of
+    //    this division by a power of 2, and adjust the result. Result needs adjustment (increment by 1) when
+    //     - input is positive, shift is non-zero, and remainder >= 2**(shift - 1), e.g. 10 >> 2 needs adjustment
+    //     - input is negative, shift is non-zero, and remainder > 2**(shift - 1), e.g. -10 >> 2 doesn't need adjustment
+    //    These conditions can be generalized as
+    //        remainder + (input <= 0) > 2**(shift - 1)
+    //    or equivalently
+    //        remainder - (input < 0) > ((2**shift - 1) >> 1)
+    //    When shift is 0, remainder is 0 as well, the last condition is always false, and no adjustment is done.
+    //
+    // Among these options, option 3 is the most performant across the board, although option 1 is promising for 64-bit
+    // instruction sets.
     const int32_t x_remainder = (x_q31product & remainder_mask) - (int32_t)(x_q31product < 0);
     const int32_t y_remainder = (y_q31product & remainder_mask) - (int32_t)(y_q31product < 0);
     const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t)(z_q31product < 0);
@@ -111,21 +103,17 @@
     const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t)(z_remainder > threshold);
     const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t)(w_remainder > threshold);
 
-    /*
-     * Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-     */
+    // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
     const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
     const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
     const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
 
-    /*
-     * Add zero point to clamped value.
-     * The result is guaranteed to be in [qmin, qmax] range.
-     *
-     * This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-     * range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
-     */
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
diff --git a/src/requantization/q31-sse2.c b/src/requantization/q31-sse2.c
index 4223e8a..1ab921f 100644
--- a/src/requantization/q31-sse2.c
+++ b/src/requantization/q31-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -30,15 +28,15 @@
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -160,28 +158,26 @@
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 16x PSHUFD
-     * 4x SHUFPS
-     * 8x PMULUDQ
-     * 8x PXOR (setzero)
-     * 12x PXOR
-     * 4x PAND
-     * 8x PADDQ
-     * 4x PADDD
-     * 2x PADDW
-     * 8x PSUBQ
-     * 8x PSUBD
-     * 8x PSRLQ (immediate)
-     * 4x PSRAD (register)
-     * 12x PCMPGTD
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 111 instructions total
-     */
+    // 16x PSHUFD
+    // 4x SHUFPS
+    // 8x PMULUDQ
+    // 8x PXOR (setzero)
+    // 12x PXOR
+    // 4x PAND
+    // 8x PADDQ
+    // 4x PADDD
+    // 2x PADDW
+    // 8x PSUBQ
+    // 8x PSUBD
+    // 8x PSRLQ (immediate)
+    // 4x PSRAD (register)
+    // 12x PCMPGTD
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 111 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/q31-sse4.c b/src/requantization/q31-sse4.c
index c598d6b..a90934f 100644
--- a/src/requantization/q31-sse4.c
+++ b/src/requantization/q31-sse4.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -30,15 +28,15 @@
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -111,26 +109,24 @@
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 4x PSHUFD
-     * 8x PMULDQ
-     * 12x PADDQ
-     * 4x PADDD
-     * 2x PADDW
-     * 4x PSUBD
-     * 4x PSLRQ (immediate)
-     * 4x PSRAD (register)
-     * 4x PBLENDW
-     * 4x PAND
-     * 4x PXOR (setzero)
-     * 8x PCMPGTD
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 67 instructions total
-     */
+    // 4x PSHUFD
+    // 8x PMULDQ
+    // 12x PADDQ
+    // 4x PADDD
+    // 2x PADDW
+    // 4x PSUBD
+    // 4x PSRLQ (immediate)
+    // 4x PSRAD (register)
+    // 4x PBLENDW
+    // 4x PAND
+    // 4x PXOR (setzero)
+    // 8x PCMPGTD
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 67 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/requantization/q31-ssse3.c b/src/requantization/q31-ssse3.c
index 368ae75..e43e2d8 100644
--- a/src/requantization/q31-ssse3.c
+++ b/src/requantization/q31-ssse3.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 #include <stdint.h>
@@ -30,15 +28,15 @@
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -160,29 +158,27 @@
     const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed);
     const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin);
 
-    /*
-     * 16x PSHUFD
-     * 4x SHUFPS
-     * 8x PMULUDQ
-     * 8x PXOR (setzero)
-     * 8x PXOR
-     * 4x PAND
-     * 8x PADDQ
-     * 4x PADDD
-     * 2x PADDW
-     * 8x PSUBQ
-     * 4x PSUBD
-     * 8x PSRLQ (immediate)
-     * 4x PSRAD (register)
-     * 12x PCMPGTD
-     * 4x PABSD
-     * 2x PACKSSDW
-     * 1x PACKUSWB
-     * 1x PMAXUB
-     * 1x PMINUB
-     * ---------------------
-     * 107 instructions total
-     */
+    // 16x PSHUFD
+    // 4x SHUFPS
+    // 8x PMULUDQ
+    // 8x PXOR (setzero)
+    // 8x PXOR
+    // 4x PAND
+    // 8x PADDQ
+    // 4x PADDD
+    // 2x PADDW
+    // 8x PSUBQ
+    // 4x PSUBD
+    // 8x PSRLQ (immediate)
+    // 4x PSRAD (register)
+    // 12x PCMPGTD
+    // 4x PABSD
+    // 2x PACKSSDW
+    // 1x PACKUSWB
+    // 1x PMAXUB
+    // 1x PMINUB
+    // ---------------------
+    // 107 instructions total
 
     _mm_storeu_si128((__m128i*) output, xyzw_clamped);
     output += 16;
diff --git a/src/u8-clamp/neon.c b/src/u8-clamp/neon.c
index 04725f9..9f98175 100644
--- a/src/u8-clamp/neon.c
+++ b/src/u8-clamp/neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-clamp/scalar.c b/src/u8-clamp/scalar.c
index 6b513b0..d7586e6 100644
--- a/src/u8-clamp/scalar.c
+++ b/src/u8-clamp/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-clamp/sse2.c b/src/u8-clamp/sse2.c
index 04179a2..f01c346 100644
--- a/src/u8-clamp/sse2.c
+++ b/src/u8-clamp/sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-lut32norm/scalar.c b/src/u8-lut32norm/scalar.c
index 3e54121..037d186 100644
--- a/src/u8-lut32norm/scalar.c
+++ b/src/u8-lut32norm/scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-maxpool/9p8q-neon.c b/src/u8-maxpool/9p8q-neon.c
index 49b747e..28c19ea 100644
--- a/src/u8-maxpool/9p8q-neon.c
+++ b/src/u8-maxpool/9p8q-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-maxpool/9p8q-scalar.c b/src/u8-maxpool/9p8q-scalar.c
index 7aa20a5..198dce7 100644
--- a/src/u8-maxpool/9p8q-scalar.c
+++ b/src/u8-maxpool/9p8q-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-maxpool/9p8q-sse2.c b/src/u8-maxpool/9p8q-sse2.c
index 02e84eb..1805a69 100644
--- a/src/u8-maxpool/9p8q-sse2.c
+++ b/src/u8-maxpool/9p8q-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-rmax/neon.c b/src/u8-rmax/neon.c
index 12e5bbf..2c3eb1d 100644
--- a/src/u8-rmax/neon.c
+++ b/src/u8-rmax/neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-rmax/scalar.c b/src/u8-rmax/scalar.c
index a9a3298..74a581a 100644
--- a/src/u8-rmax/scalar.c
+++ b/src/u8-rmax/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/u8-rmax/sse2.c b/src/u8-rmax/sse2.c
index 034962d..9f1b293 100644
--- a/src/u8-rmax/sse2.c
+++ b/src/u8-rmax/sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/wasm-stubs.c b/src/wasm-stubs.c
index 78790b5..29826d7 100644
--- a/src/wasm-stubs.c
+++ b/src/wasm-stubs.c
@@ -16,4 +16,4 @@
 uint32_t xnn_stub_wasm_f32_min(uint32_t a, uint32_t b) {
   return fp32_to_bits(__builtin_wasm_min_f32(fp32_from_bits(a), fp32_from_bits(b)));
 }
-#endif /* CPUINFO_ARCH_WASM || CPUINFO_ARCH_WASMSIMD */
+#endif  // CPUINFO_ARCH_WASM || CPUINFO_ARCH_WASMSIMD
diff --git a/src/x32-packx/x2-scalar.c b/src/x32-packx/x2-scalar.c
index 91c9846..08d0805 100644
--- a/src/x32-packx/x2-scalar.c
+++ b/src/x32-packx/x2-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-packx/x3-scalar.c b/src/x32-packx/x3-scalar.c
index dc6d8ed..59fa0ff 100644
--- a/src/x32-packx/x3-scalar.c
+++ b/src/x32-packx/x3-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-packx/x4-neon-st4.c b/src/x32-packx/x4-neon-st4.c
index 97e5501..6f0c03e 100644
--- a/src/x32-packx/x4-neon-st4.c
+++ b/src/x32-packx/x4-neon-st4.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-packx/x4-psimd.c b/src/x32-packx/x4-psimd.c
index 4d15a7e..0fbe4ee 100644
--- a/src/x32-packx/x4-psimd.c
+++ b/src/x32-packx/x4-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-packx/x4-scalar.c b/src/x32-packx/x4-scalar.c
index 17f7bb7..0577c54 100644
--- a/src/x32-packx/x4-scalar.c
+++ b/src/x32-packx/x4-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-packx/x4-sse.c b/src/x32-packx/x4-sse.c
index 1ff64d7..c43f4b4 100644
--- a/src/x32-packx/x4-sse.c
+++ b/src/x32-packx/x4-sse.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-pad/x2-neon.c b/src/x32-pad/x2-neon.c
index f1d9da0..4db318b 100644
--- a/src/x32-pad/x2-neon.c
+++ b/src/x32-pad/x2-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -39,7 +37,7 @@
   }
   const uint32x4_t vc = vmovq_n_u32(c);
 
-  /* Pre-pad input channels */
+  // Pre-pad input channels.
   for (; l >= 16; l -= 16) {
     vst1q_u32(y0, vc); y0 += 4;
     vst1q_u32(y1, vc); y1 += 4;
@@ -53,7 +51,7 @@
     vst1q_lane_u32(y1, vc, 0); y1 += 1;
   }
 
-  /* Copy input channels */
+  // Copy input channels.
   for (; n >= 16; n -= 16) {
     const uint32x4_t vt0 = vld1q_u32(x0); x0 += 4;
     const uint32x4_t vt1 = vld1q_u32(x1); x1 += 4;
@@ -77,7 +75,7 @@
     }
   }
 
-  /* Post-pad input channels */
+  // Post-pad input channels.
   for (; r >= 16; r -= 16) {
     vst1q_u32(y0, vc); y0 += 4;
     vst1q_u32(y1, vc); y1 += 4;
diff --git a/src/x32-pad/x2-psimd.c b/src/x32-pad/x2-psimd.c
index 78b471a..dacb0c6 100644
--- a/src/x32-pad/x2-psimd.c
+++ b/src/x32-pad/x2-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -39,7 +37,7 @@
   }
   const psimd_u32 vc = psimd_splat_u32(c);
 
-  /* Pre-pad input channels */
+  // Pre-pad input channels.
   for (; l >= 16; l -= 16) {
     psimd_store_u32(y0, vc); y0 += 4;
     psimd_store_u32(y1, vc); y1 += 4;
@@ -53,7 +51,7 @@
     psimd_store1_u32(y1, vc); y1 += 1;
   }
 
-  /* Copy input channels */
+  // Copy input channels.
   for (; n >= 16; n -= 16) {
     const psimd_u32 vt0 = psimd_load_u32(x0); x0 += 4;
     const psimd_u32 vt1 = psimd_load_u32(x1); x1 += 4;
@@ -75,7 +73,7 @@
     }
   }
 
-  /* Post-pad input channels */
+  // Post-pad input channels.
   for (; r >= 16; r -= 16) {
     psimd_store_u32(y0, vc); y0 += 4;
     psimd_store_u32(y1, vc); y1 += 4;
diff --git a/src/x32-pad/x2-scalar.c b/src/x32-pad/x2-scalar.c
index bb17cc7..4f120c5 100644
--- a/src/x32-pad/x2-scalar.c
+++ b/src/x32-pad/x2-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -36,19 +34,19 @@
     y1 = y0;
   }
 
-  /* Pre-pad input channels */
+  // Pre-pad input channels.
   for (; l != 0; l -= 4) {
     *y0++ = c;
     *y1++ = c;
   }
 
-  /* Copy input channels */
+  // Copy input channels.
   for (; n != 0; n -= 4) {
     *y0++ = *x0++;
     *y1++ = *x1++;
   }
 
-  /* Post-pad input channels */
+  // Post-pad input channels.
   for (; r != 0; r -= 4) {
     *y0++ = c;
     *y1++ = c;
diff --git a/src/x32-pad/x2-sse2.c b/src/x32-pad/x2-sse2.c
index 49f7b3f..9f556a4 100644
--- a/src/x32-pad/x2-sse2.c
+++ b/src/x32-pad/x2-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -39,7 +37,7 @@
   }
   const __m128i vc = _mm_set1_epi32((int) c);
 
-  /* Pre-pad input channels */
+  // Pre-pad input channels.
   for (; l >= 16; l -= 16) {
     _mm_storeu_si128((__m128i*) y0, vc); y0 += 4;
     _mm_storeu_si128((__m128i*) y1, vc); y1 += 4;
@@ -53,7 +51,7 @@
     *((uint32_t*) y1) = (uint32_t) _mm_cvtsi128_si32(vc); y1 += 1;
   }
 
-  /* Copy input channels */
+  // Copy input channels.
   for (; n >= 16; n -= 16) {
     const __m128i vt0 = _mm_loadu_si128((const __m128i*) x0); x0 += 4;
     const __m128i vt1 = _mm_loadu_si128((const __m128i*) x1); x1 += 4;
@@ -75,7 +73,7 @@
     }
   }
 
-  /* Post-pad input channels */
+  // Post-pad input channels.
   for (; r >= 16; r -= 16) {
     _mm_storeu_si128((__m128i*) y0, vc); y0 += 4;
     _mm_storeu_si128((__m128i*) y1, vc); y1 += 4;
diff --git a/src/x32-unpool/psimd.c b/src/x32-unpool/psimd.c
index 3501997..74c8ea4 100644
--- a/src/x32-unpool/psimd.c
+++ b/src/x32-unpool/psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -20,7 +18,7 @@
     const uint32_t* index,
     uint32_t** output)
 {
-  /* Pre-initialize outputs with constant */
+  // Pre-initialize outputs with constant.
   const psimd_u32 vf = psimd_splat_u32(f);
   uint32_t** os = output;
   do {
@@ -41,7 +39,7 @@
     }
   } while (--p != 0);
 
-  /* Copy indexed elements to output */
+  // Copy indexed elements to output.
   size_t offset = 0;
   do {
     const uint32_t i = *index++;
diff --git a/src/x32-unpool/scalar.c b/src/x32-unpool/scalar.c
index dd6abab..f58f4f8 100644
--- a/src/x32-unpool/scalar.c
+++ b/src/x32-unpool/scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -18,7 +16,7 @@
     const uint32_t* index,
     uint32_t** output)
 {
-  /* Pre-initialize outputs with constant */
+  // Pre-initialize outputs with constant.
   uint32_t** os = output;
   do {
     uint32_t* o = *os++;
@@ -28,7 +26,7 @@
     } while (--k != 0);
   } while (--p != 0);
 
-  /* Copy indexed elements to output */
+  // Copy indexed elements to output.
   size_t offset = 0;
   do {
     const uint32_t i = *index++;
diff --git a/src/x32-zip/x2-neon.c b/src/x32-zip/x2-neon.c
index c88695b..d238992 100644
--- a/src/x32-zip/x2-neon.c
+++ b/src/x32-zip/x2-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x2-psimd.c b/src/x32-zip/x2-psimd.c
index 4723f2b..4460a6e 100644
--- a/src/x32-zip/x2-psimd.c
+++ b/src/x32-zip/x2-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x2-scalar.c b/src/x32-zip/x2-scalar.c
index 06e58a3..c2a8d15 100644
--- a/src/x32-zip/x2-scalar.c
+++ b/src/x32-zip/x2-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x2-sse2.c b/src/x32-zip/x2-sse2.c
index c61575d..d14ba49 100644
--- a/src/x32-zip/x2-sse2.c
+++ b/src/x32-zip/x2-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x3-neon.c b/src/x32-zip/x3-neon.c
index 1522a2d..d78d04b 100644
--- a/src/x32-zip/x3-neon.c
+++ b/src/x32-zip/x3-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x3-psimd.c b/src/x32-zip/x3-psimd.c
index dea008c..4d68e2a 100644
--- a/src/x32-zip/x3-psimd.c
+++ b/src/x32-zip/x3-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -26,28 +24,28 @@
   uint32_t* o = (uint32_t*) output;
 
   while (n >= 16) {
-    /* vx = ( x3, x2, x1, x0 ) */
+    // vx = ( x3, x2, x1, x0 )
     const psimd_u32 vx = psimd_load_u32(x);
     x += 4;
-    /* vy = ( y3, y2, y1, y0 ) */
+    // vy = ( y3, y2, y1, y0 )
     const psimd_u32 vy = psimd_load_u32(y);
     y += 4;
-    /* vz = ( z3, z2, z1, z0 ) */
+    // vz = ( z3, z2, z1, z0 )
     const psimd_u32 vz = psimd_load_u32(z);
     z += 4;
 
-    /* vxy = ( y2, y0, x2, x0 ) */
+    // vxy = ( y2, y0, x2, x0 )
     const psimd_u32 vxy = psimd_concat_even_u32(vx, vy);
-    /* vyz = ( z3, z1, y3, y1 ) */
+    // vyz = ( z3, z1, y3, y1 )
     const psimd_u32 vyz = psimd_concat_odd_u32(vy, vz);
-    /* vzx = ( x3, x1, z2, z0 ) */
+    // vzx = ( x3, x1, z2, z0 )
     const psimd_u32 vzx = __builtin_shufflevector(vz, vx, 0, 2, 4+1, 4+3);
 
-    /* vxyz0 = ( x1, z0, y0, x0 ) */
+    // vxyz0 = ( x1, z0, y0, x0 )
     const psimd_u32 vxyz0 = psimd_concat_even_u32(vxy, vzx);
-    /* vxyz1 = ( y2, x2, z1, y1 ) */
+    // vxyz1 = ( y2, x2, z1, y1 )
     const psimd_u32 vxyz1 = __builtin_shufflevector(vyz, vxy, 0, 2, 4+1, 4+3);
-    /* vxyz2 = ( z3, y3, x3, z2 ) */
+    // vxyz2 = ( z3, y3, x3, z2 )
     const psimd_u32 vxyz2 = psimd_concat_odd_u32(vzx, vyz);
 
     psimd_store_u32(o, vxyz0);
diff --git a/src/x32-zip/x3-scalar.c b/src/x32-zip/x3-scalar.c
index bb25c3b..7d3271b 100644
--- a/src/x32-zip/x3-scalar.c
+++ b/src/x32-zip/x3-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x3-sse2.c b/src/x32-zip/x3-sse2.c
index 49925ae..f730d71 100644
--- a/src/x32-zip/x3-sse2.c
+++ b/src/x32-zip/x3-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
@@ -26,28 +24,28 @@
   float* o = (float*) output;
 
   while (n >= 16) {
-    /* vx = ( x3, x2, x1, x0 ) */
+    // vx = ( x3, x2, x1, x0 )
     const __m128 vx = _mm_loadu_ps(x);
     x += 4;
-    /* vy = ( y3, y2, y1, y0 ) */
+    // vy = ( y3, y2, y1, y0 )
     const __m128 vy = _mm_loadu_ps(y);
     y += 4;
-    /* vz = ( z3, z2, z1, z0 ) */
+    // vz = ( z3, z2, z1, z0 )
     const __m128 vz = _mm_loadu_ps(z);
     z += 4;
 
-    /* vxy = ( y2, y0, x2, x0 ) */
+    // vxy = ( y2, y0, x2, x0 )
     const __m128 vxy = _mm_shuffle_ps(vx, vy, _MM_SHUFFLE(2, 0, 2, 0));
-    /* vyz = ( z3, z1, y3, y1 ) */
+    // vyz = ( z3, z1, y3, y1 )
     const __m128 vyz = _mm_shuffle_ps(vy, vz, _MM_SHUFFLE(3, 1, 3, 1));
-    /* vzx = ( x3, x1, z2, z0 ) */
+    // vzx = ( x3, x1, z2, z0 )
     const __m128 vzx = _mm_shuffle_ps(vz, vx, _MM_SHUFFLE(3, 1, 2, 0));
 
-    /* vxyz0 = ( x1, z0, y0, x0 ) */
+    // vxyz0 = ( x1, z0, y0, x0 )
     const __m128 vxyz0 = _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(2, 0, 2, 0));
-    /* vxyz1 = ( y2, x2, z1, y1 ) */
+    // vxyz1 = ( y2, x2, z1, y1 )
     const __m128 vxyz1 = _mm_shuffle_ps(vyz, vxy, _MM_SHUFFLE(3, 1, 2, 0));
-    /* vxyz2 = ( z3, y3, x3, z2 ) */
+    // vxyz2 = ( z3, y3, x3, z2 )
     const __m128 vxyz2 = _mm_shuffle_ps(vzx, vyz, _MM_SHUFFLE(3, 1, 3, 1));
 
     _mm_storeu_ps(o, vxyz0);
@@ -58,21 +56,21 @@
   }
   if XNN_UNLIKELY(n != 0) {
     if (n & 8) {
-      /* vx = ( -, -, x1, x0 ) */
+      // vx = ( -, -, x1, x0 )
       const __m128 vx = _mm_castpd_ps(_mm_load_sd((const double*) x));
       x += 2;
-      /* vy = ( -, -, y1, y0 ) */
+      // vy = ( -, -, y1, y0 )
       const __m128 vy = _mm_castpd_ps(_mm_load_sd((const double*) y));
       y += 2;
-      /* vz = ( -, -, z1, z0 ) */
+      // vz = ( -, -, z1, z0 )
       const __m128 vz = _mm_castpd_ps(_mm_load_sd((const double*) z));
       z += 2;
 
-      /* vxy = ( y1, x1, y0, x0 ) */
+      // vxy = ( y1, x1, y0, x0 )
       const __m128 vxy = _mm_unpacklo_ps(vx, vy);
-      /* vzx = ( x1, z1, x0, z0 ) */
+      // vzx = ( x1, z1, x0, z0 )
       const __m128 vzx = _mm_unpacklo_ps(vz, vx);
-      /* vyz = ( z1, y1, z0, y0 ) */
+      // vyz = ( z1, y1, z0, y0 )
       const __m128 vyz = _mm_unpacklo_ps(vy, vz);
 
       _mm_storeu_ps(o, _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(3, 0, 1, 0)));
diff --git a/src/x32-zip/x4-neon.c b/src/x32-zip/x4-neon.c
index 45c3dc0..915dc03 100644
--- a/src/x32-zip/x4-neon.c
+++ b/src/x32-zip/x4-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x4-psimd.c b/src/x32-zip/x4-psimd.c
index e144f34..a220f36 100644
--- a/src/x32-zip/x4-psimd.c
+++ b/src/x32-zip/x4-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x4-scalar.c b/src/x32-zip/x4-scalar.c
index a1ca351..104a2c9 100644
--- a/src/x32-zip/x4-scalar.c
+++ b/src/x32-zip/x4-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/x4-sse2.c b/src/x32-zip/x4-sse2.c
index 0ffc400..c43cbfb 100644
--- a/src/x32-zip/x4-sse2.c
+++ b/src/x32-zip/x4-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/xm-neon.c b/src/x32-zip/xm-neon.c
index 2ab68a1..ef8a808 100644
--- a/src/x32-zip/xm-neon.c
+++ b/src/x32-zip/xm-neon.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/xm-psimd.c b/src/x32-zip/xm-psimd.c
index d5b0c36..7929093 100644
--- a/src/x32-zip/xm-psimd.c
+++ b/src/x32-zip/xm-psimd.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/xm-scalar.c b/src/x32-zip/xm-scalar.c
index fa2ee80..ae81537 100644
--- a/src/x32-zip/xm-scalar.c
+++ b/src/x32-zip/xm-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x32-zip/xm-sse2.c b/src/x32-zip/xm-sse2.c
index da06541..513d9ea 100644
--- a/src/x32-zip/xm-sse2.c
+++ b/src/x32-zip/xm-sse2.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x8-lut/scalar.c b/src/x8-lut/scalar.c
index 6b6e8a8..bf3ee9e 100644
--- a/src/x8-lut/scalar.c
+++ b/src/x8-lut/scalar.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x8-zip/x2-neon.c b/src/x8-zip/x2-neon.c
index 3732e90..fdcca32 100644
--- a/src/x8-zip/x2-neon.c
+++ b/src/x8-zip/x2-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <arm_neon.h>
 
diff --git a/src/x8-zip/x2-scalar.c b/src/x8-zip/x2-scalar.c
index e7906ca..10447ad 100644
--- a/src/x8-zip/x2-scalar.c
+++ b/src/x8-zip/x2-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x8-zip/x2-sse2.c b/src/x8-zip/x2-sse2.c
index 6ba0963..71dd903 100644
--- a/src/x8-zip/x2-sse2.c
+++ b/src/x8-zip/x2-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <emmintrin.h>
 
diff --git a/src/x8-zip/x3-neon.c b/src/x8-zip/x3-neon.c
index 9348ecb..e71ed72 100644
--- a/src/x8-zip/x3-neon.c
+++ b/src/x8-zip/x3-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <arm_neon.h>
 
diff --git a/src/x8-zip/x3-scalar.c b/src/x8-zip/x3-scalar.c
index b4319ef..485c1cc 100644
--- a/src/x8-zip/x3-scalar.c
+++ b/src/x8-zip/x3-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x8-zip/x3-sse2.c b/src/x8-zip/x3-sse2.c
index 045fc20..04298c3 100644
--- a/src/x8-zip/x3-sse2.c
+++ b/src/x8-zip/x3-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <emmintrin.h>
 
@@ -27,47 +25,47 @@
     const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
     const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);
     do {
-      /* vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 ) */
+      // vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 )
       const __m128i vx = _mm_loadu_si128((const __m128i*) x);
       x += 16;
-      /* vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 ) */
+      // vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 )
       const __m128i vy = _mm_loadu_si128((const __m128i*) y);
       y += 16;
-      /* vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 ) */
+      // vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 )
       const __m128i vz = _mm_loadu_si128((const __m128i*) z);
       z += 16;
 
-      /* vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 ) */
+      // vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 )
       const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
-      /* vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 ) */
+      // vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 )
       const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
-      /* vzoxo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 ) */
+      // vzexo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 )
       const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));
 
-      /* vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 ) */
+      // vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 )
       const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
-      /* vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 ) */
+      // vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 )
       const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
-      /* vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 ) */
+      // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 )
       const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));
 
-      /* vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 ) */
+      // vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 )
       const __m128i vtemp0 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
-      /* vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 ) */
+      // vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 )
       const __m128i vtemp1 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
-      /* vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 ) */
+      // vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 )
       const __m128i vtemp2 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));
 
-      /* vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 ) */
+      // vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 )
       const __m128i vxyz0 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
-      /* vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 ) */
+      // vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 )
       const __m128i vxyz1 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
-      /* vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 ) */
+      // vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
       const __m128i vxyz2 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));
 
@@ -79,44 +77,44 @@
     } while (n >= 16);
     if (n != 0) {
       const size_t address_increment = n - 16;
-      /* vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 ) */
+      // vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 )
       const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
-      /* vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 ) */
+      // vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 )
       const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
-      /* vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 ) */
+      // vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 )
       const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));
 
-      /* vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 ) */
+      // vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 )
       const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
-      /* vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 ) */
+      // vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 )
       const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
-      /* vzoxo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 ) */
+      // vzexo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 )
       const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));
 
-      /* vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 ) */
+      // vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 )
       const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
-      /* vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 ) */
+      // vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 )
       const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
-      /* vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 ) */
+      // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 )
       const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));
 
-      /* vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 ) */
+      // vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 )
       const __m128i vtemp0 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
-      /* vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 ) */
+      // vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 )
       const __m128i vtemp1 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
-      /* vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 ) */
+      // vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 )
       const __m128i vtemp2 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));
 
-      /* vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 ) */
+      // vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 )
       const __m128i vxyz0 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
-      /* vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 ) */
+      // vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 )
       const __m128i vxyz1 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
-      /* vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 ) */
+      // vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
       const __m128i vxyz2 = _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));
 
diff --git a/src/x8-zip/x4-neon.c b/src/x8-zip/x4-neon.c
index 38ac597..5332709 100644
--- a/src/x8-zip/x4-neon.c
+++ b/src/x8-zip/x4-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <arm_neon.h>
 
diff --git a/src/x8-zip/x4-scalar.c b/src/x8-zip/x4-scalar.c
index b56c969..b29e7bf 100644
--- a/src/x8-zip/x4-scalar.c
+++ b/src/x8-zip/x4-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x8-zip/x4-sse2.c b/src/x8-zip/x4-sse2.c
index 292a981..c9c74af 100644
--- a/src/x8-zip/x4-sse2.c
+++ b/src/x8-zip/x4-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <emmintrin.h>
 
diff --git a/src/x8-zip/xm-neon.c b/src/x8-zip/xm-neon.c
index 2ac19eb..b722155 100644
--- a/src/x8-zip/xm-neon.c
+++ b/src/x8-zip/xm-neon.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <arm_neon.h>
 
diff --git a/src/x8-zip/xm-scalar.c b/src/x8-zip/xm-scalar.c
index 32a8ae2..cb60b5e 100644
--- a/src/x8-zip/xm-scalar.c
+++ b/src/x8-zip/xm-scalar.c
@@ -1,9 +1,7 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <assert.h>
 
diff --git a/src/x8-zip/xm-sse2.c b/src/x8-zip/xm-sse2.c
index 30971f9..f60b3dd 100644
--- a/src/x8-zip/xm-sse2.c
+++ b/src/x8-zip/xm-sse2.c
@@ -1,12 +1,10 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
 
 #include <emmintrin.h>
 
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
index 5b9776d..ce60230 100644
--- a/src/xnnpack/argmaxpool.h
+++ b/src/xnnpack/argmaxpool.h
@@ -56,5 +56,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h
index 5fd51b9..d838fb5 100644
--- a/src/xnnpack/avgpool.h
+++ b/src/xnnpack/avgpool.h
@@ -92,5 +92,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index db19d28..0cd59b4 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h
@@ -45,5 +45,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/conv.h b/src/xnnpack/conv.h
index c1bdec3..efc2634 100644
--- a/src/xnnpack/conv.h
+++ b/src/xnnpack/conv.h
@@ -59,5 +59,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index dc52a61..22794a6 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -84,5 +84,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index b567196..4680084 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -95,5 +95,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 27f591d..bcc067a 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -185,5 +185,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
index 8d0ab93..eda1a1f 100644
--- a/src/xnnpack/hswish.h
+++ b/src/xnnpack/hswish.h
@@ -31,5 +31,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 4d30c6f..b9dc93d 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -101,5 +101,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/im2col.h b/src/xnnpack/im2col.h
index 07323e3..d57dabb 100644
--- a/src/xnnpack/im2col.h
+++ b/src/xnnpack/im2col.h
@@ -33,5 +33,5 @@
   void* output);
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
index 60be1f6..6853052 100644
--- a/src/xnnpack/indirection.h
+++ b/src/xnnpack/indirection.h
@@ -53,5 +53,5 @@
   uint32_t log2_element_size);
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/lut.h b/src/xnnpack/lut.h
index 49b0ec4..830b7e7 100644
--- a/src/xnnpack/lut.h
+++ b/src/xnnpack/lut.h
@@ -40,5 +40,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
index 1c134d7..1cac764 100644
--- a/src/xnnpack/maxpool.h
+++ b/src/xnnpack/maxpool.h
@@ -52,5 +52,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index a34d6fd..62f956a 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -155,7 +155,7 @@
   size_t slice_height;
   size_t indirection_y_stride;
   size_t indirection_x_stride;
-  /* kernel_size * mr * sizeof(void*) */
+  // scaled_kernel_size := kernel_size * mr * sizeof(void*).
   size_t scaled_kernel_size;
 };
 
diff --git a/src/xnnpack/packx.h b/src/xnnpack/packx.h
index 20b3bc1..a42259d 100644
--- a/src/xnnpack/packx.h
+++ b/src/xnnpack/packx.h
@@ -32,5 +32,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h
index 3cb8103..d3a12ec 100644
--- a/src/xnnpack/pad.h
+++ b/src/xnnpack/pad.h
@@ -35,5 +35,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 30e8393..11065c4 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -34,7 +34,7 @@
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_f32_spchw_params {
@@ -50,7 +50,8 @@
     float min;
     float max;
   } neon;
-#elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
@@ -58,7 +59,7 @@
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_u8_output_params {
@@ -71,13 +72,13 @@
     uint8_t max;
     uint8_t min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_f32_avgpool_params {
@@ -92,14 +93,14 @@
     XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) float output_min[4];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
     XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) float output_min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 };
 
 union xnn_f32_gavgpool_params {
@@ -115,7 +116,7 @@
     XNN_ALIGN(16) float output_min[4];
     XNN_ALIGN(16) uint32_t mask[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
@@ -123,7 +124,7 @@
     XNN_ALIGN(16) float output_min;
     XNN_ALIGN(16) uint32_t mask[4];
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 };
 
 union xnn_f32_hswish_params {
@@ -138,7 +139,7 @@
     XNN_ALIGN(16) float half[4];
     XNN_ALIGN(16) float one[4];
   } sse;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_q8_gemm_params {
@@ -163,7 +164,7 @@
     uint8_t output_max;
     uint8_t output_min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int16_t kernel_zero_point[8];
@@ -177,7 +178,7 @@
     XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_q8_add_params {
@@ -203,7 +204,7 @@
     uint8_t y_max;
     uint8_t y_min;
   } neon;
-#endif
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t zero_point_product[4];
@@ -220,7 +221,7 @@
     uint32_t a_multiplier;
     uint32_t b_multiplier;
   } sse2;
-#endif
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_q8_avgpool_params {
@@ -242,7 +243,7 @@
     uint8_t output_max;
     uint8_t output_min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t bias[4];
@@ -253,7 +254,7 @@
     XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
-#endif
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_fp32_requantization_params {
@@ -337,7 +338,7 @@
     uint8_t max;
     uint8_t min;
   } neon;
-#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t multiplier[4];
@@ -349,7 +350,7 @@
     XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
   } sse2;
-#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
 };
 
 union xnn_requantization_params {
@@ -1125,7 +1126,7 @@
 struct gemm_parameters {
   xnn_gemm_ukernel_function gemm;
   xnn_igemm_ukernel_function igemm;
-  /* Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters */
+  // Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters.
   xnn_gemm_ukernel_function gemm1;
   xnn_igemm_ukernel_function igemm1;
   uint8_t mr;
diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h
index f124519..3be16d3 100644
--- a/src/xnnpack/pavgpool.h
+++ b/src/xnnpack/pavgpool.h
@@ -56,5 +56,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/ppmm.h b/src/xnnpack/ppmm.h
index 1bf6941..3239736 100644
--- a/src/xnnpack/ppmm.h
+++ b/src/xnnpack/ppmm.h
@@ -41,5 +41,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h
index 2a882a7..559696a 100644
--- a/src/xnnpack/prelu.h
+++ b/src/xnnpack/prelu.h
@@ -34,5 +34,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
index ee6e86d..cfeaed6 100644
--- a/src/xnnpack/requantization-stubs.h
+++ b/src/xnnpack/requantization-stubs.h
@@ -65,5 +65,5 @@
 DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__neon)
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index bf3e100..51cff74 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -34,15 +34,15 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -73,15 +73,15 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -150,17 +150,17 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   assert(scale >= 0x1.0p-32f);
   assert(scale < 256.0f);
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
   assert(multiplier >= INT32_C(0x00800000));
   assert(multiplier <= INT32_C(0x00FFFFFF));
 
-  /* Shift is in [16, 55] range */
+  // Shift is in [16, 55] range.
   const int32_t shift = 127 + 23 - (scale_bits >> 23);
   assert(shift >= 16);
   assert(shift < 64);
@@ -218,17 +218,17 @@
   uint8_t output_min,
   uint8_t output_max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   assert(scale >= 0x1.0p-32f);
   assert(scale < 256.0f);
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
   assert(multiplier >= INT32_C(0x00800000));
   assert(multiplier <= INT32_C(0x00FFFFFF));
 
-  /* Shift is in [16, 55] range */
+  // Shift is in [16, 55] range.
   const int32_t shift = 127 + 23 - (scale_bits >> 23);
   assert(shift >= 16);
   assert(shift < 64);
@@ -1023,20 +1023,20 @@
   assert(a_output_scale < 0x1.0p+8f);
   assert(b_output_scale < 0x1.0p+8f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
   assert(max_output_scale >= 0x1.0p-14f);
   assert(max_output_scale < 0x1.0p+8f);
   const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
   const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
-  /* Shift is in [13, 31] range */
+  // Shift is in [13, 31] range.
   const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
   assert(shift < 32);
   assert(shift >= 13);
 
   const float scale_multiplier = fp32_from_bits((uint32_t) (21 - max_scale_exponent + 127) << 23);
 
-  /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+  // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
   const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(a_output_scale * scale_multiplier);
   const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(b_output_scale * scale_multiplier);
   assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
@@ -1112,18 +1112,18 @@
   assert(a_output_scale < 0x1.0p+8f);
   assert(b_output_scale < 0x1.0p+8f);
 
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
   assert(max_output_scale >= 0x1.0p-10f);
   assert(max_output_scale < 0x1.0p+8f);
   const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
   const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
-  /* Shift is in [13, 31] range */
+  // Shift is in [13, 31] range.
   const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
   assert(shift < 32);
   assert(shift >= 13);
 
-  /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+  // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
   const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(a_output_scale) + (shift << 23)));
   const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(b_output_scale) + (shift << 23)));
   assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
@@ -1152,17 +1152,17 @@
   uint8_t min,
   uint8_t max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   assert(scale < 1.0f);
   assert(scale >= 0x1.0p-32f);
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -1186,15 +1186,15 @@
   uint8_t min,
   uint8_t max)
 {
-  /* Compute requantization parameters */
+  // Compute requantization parameters.
   const uint32_t scale_bits = fp32_to_bits(scale);
 
-  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
   const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
   assert(multiplier >= INT32_C(0x40000000));
   assert(multiplier <= INT32_C(0x7FFFFF80));
 
-  /* Shift is in [0, 31] range */
+  // Shift is in [0, 31] range.
   const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
   assert(shift >= 0);
   assert(shift < 32);
@@ -1286,16 +1286,16 @@
   uint8_t a, uint8_t b,
   union xnn_q8_add_params params)
 {
-  /* Multiply by factors and accumulate products */
+  // Multiply by factors and accumulate products.
   int32_t acc = params.scalar.zero_point_product +
     (int32_t) ((uint32_t) a * params.scalar.a_multiplier) +
     (int32_t) ((uint32_t) b * params.scalar.b_multiplier);
 
-  /* Shift right and round */
+  // Shift right and round.
   const int32_t rem = (acc & params.scalar.remainder_mask) - (int32_t) (acc < 0);
   acc = asr_s32(acc, params.scalar.shift) + (int32_t) (rem > params.scalar.remainder_threshold);
 
-  /* Clamp and add output zero point */
+  // Clamp and add output zero point.
   int32_t y = acc + params.scalar.y_zero_point;
   if (y >= params.scalar.y_max) {
     y = params.scalar.y_max;
diff --git a/src/xnnpack/rmax.h b/src/xnnpack/rmax.h
index 25f6e32..0dc1996 100644
--- a/src/xnnpack/rmax.h
+++ b/src/xnnpack/rmax.h
@@ -43,5 +43,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/scalar-utils.h b/src/xnnpack/scalar-utils.h
index 88d30c8..caa607d 100644
--- a/src/xnnpack/scalar-utils.h
+++ b/src/xnnpack/scalar-utils.h
@@ -32,7 +32,7 @@
   #if __GNUC__ >= 8
     #define XNN_IGNORE_SHIFT_BASE_UB __attribute__((__no_sanitize__("shift-base")))
   #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4
-    /* 4.9 <= gcc < 8 support ubsan, but doesn't support no_sanitize attribute */
+    // 4.9 <= gcc < 8 supports ubsan, but doesn't support the no_sanitize attribute
     #define XNN_IGNORE_SHIFT_BASE_UB
     #ifndef XNN_USE_SHIFT_BASE_UB_WORKAROUND
       #define XNN_USE_SHIFT_BASE_UB_WORKAROUND 1
@@ -82,28 +82,22 @@
   assert(shift >= 24);
   assert(shift < 56);
 
-  /*
-   * Compute absolute value of input as unsigned 32-bit int.
-   * All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
-   */
+  // Compute absolute value of input as unsigned 32-bit int.
+  // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
   const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
 
-  /* Compute full 64-bit product of 32-bit factors */
+  // Compute full 64-bit product of 32-bit factors.
   const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
 
-  /*
-   * Shift the full 64-bit product right with rounding.
-   * Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
-   */
+  // Shift the full 64-bit product right with rounding.
+  // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
   const uint64_t rounding = UINT64_C(1) << (shift - 1);
   const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
 
-  /*
-   * Copy the sign of input to scaled absolute input value.
-   */
+  // Copy the sign of input to scaled absolute input value.
   const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
 
-  /* Clamp scaled value with zero point between smin and smax */
+  // Clamp scaled value with zero point between smin and smax.
   int32_t clamped_value = scaled_value;
   const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
   if (clamped_value < smin) {
@@ -114,7 +108,7 @@
     clamped_value = smax;
   }
 
-  /* Add zero point to clamped value */
+  // Add zero point to clamped value.
   const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
 
   return biased_value;
diff --git a/src/xnnpack/spmm.h b/src/xnnpack/spmm.h
index 7ea16bf..93f8ae6 100644
--- a/src/xnnpack/spmm.h
+++ b/src/xnnpack/spmm.h
@@ -62,5 +62,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/unpool.h b/src/xnnpack/unpool.h
index c02457a..ef90f31 100644
--- a/src/xnnpack/unpool.h
+++ b/src/xnnpack/unpool.h
@@ -30,5 +30,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vadd.h b/src/xnnpack/vadd.h
index a66d171..8f34a0b 100644
--- a/src/xnnpack/vadd.h
+++ b/src/xnnpack/vadd.h
@@ -47,5 +47,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vmul.h b/src/xnnpack/vmul.h
index 9747de8..2ea19f7 100644
--- a/src/xnnpack/vmul.h
+++ b/src/xnnpack/vmul.h
@@ -31,5 +31,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
index a37e747..cd28a34 100644
--- a/src/xnnpack/vmulcaddc.h
+++ b/src/xnnpack/vmulcaddc.h
@@ -35,5 +35,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/vsub.h b/src/xnnpack/vsub.h
index e444eb6..66e0152 100644
--- a/src/xnnpack/vsub.h
+++ b/src/xnnpack/vsub.h
@@ -31,5 +31,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif
diff --git a/src/xnnpack/zip.h b/src/xnnpack/zip.h
index 48b164e..4657271 100644
--- a/src/xnnpack/zip.h
+++ b/src/xnnpack/zip.h
@@ -82,5 +82,5 @@
 
 
 #ifdef __cplusplus
-} /* extern "C" */
+}  // extern "C"
 #endif