Run template code generators

PiperOrigin-RevId: 389691150
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c
index 4041920..f7c8dda 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
     // Load 12 (3x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c
index fc3b9f4..00cdc52 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   v128_t vacc2 = vacc0;
   for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c
index cb750d0..c4e79c0 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
     // Load 12 (3x4) inputs at a time.
     const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c
index 5122d4e..a6d13e6 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
     // Load 16 (4x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c
index 8ef5deb..3d8e1d0 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   v128_t vacc2 = vacc0;
   v128_t vacc3 = vacc0;
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c
index a7d1a07..37ba150 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
     // Load 16 (4x4) inputs at a time.
     const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c
index e10f902..260d10d 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
     // Load 20 (5x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c
index da422dc..5c4bd6e 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   v128_t vacc2 = vacc0;
   v128_t vacc3 = vacc0;
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c
index 0c04e0e..d287a1a 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
     // Load 20 (5x4) inputs at a time.
     const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
index 2a37aa0..5cf12be 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
     // Load 4 (1x4) inputs at a time.
     const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c
index 5f60d50..838d347 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   v128_t vacc1 = vacc0;
   for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
     // Load 8 (2x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
index a2e41ce..010be50 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
@@ -40,7 +40,7 @@
 
   const v128_t vi_max = wasm_f32x4_splat(max);
 
-  v128_t vacc0 = wasm_f32x4_splat(0.0f);
+  v128_t vacc0 = wasm_f64x2_splat(0.0);
   for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
     // Load 8 (2x4) inputs at a time.
     const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/neon-p5.c.in b/src/f32-raddstoreexpminusmax/neon-p5.c.in
index 3bb228f..d2c20fe 100644
--- a/src/f32-raddstoreexpminusmax/neon-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/neon-p5.c.in
@@ -60,7 +60,7 @@
       // Compute reduced argument n := round(x / log(2)).
       // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
       // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-      // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+      // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
       // inputs outside of [-87.336540, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
       // of the algorithm.
       $for N in range(0, ELEMENTS_TILE, 4):
@@ -141,7 +141,7 @@
     // Compute reduced argument n := round(x / log(2)).
     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
     // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
     // inputs outside of [-87.336540, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
     // of the algorithm.
     float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vx, vlog2e);
@@ -198,7 +198,7 @@
     // Compute reduced argument n := round(x / log(2)).
     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
     // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
     // inputs outside of [-87.336540, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
     // of the algorithm.
     float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vx, vlog2e);
diff --git a/src/f32-raddstoreexpminusmax/scalar-p5.c.in b/src/f32-raddstoreexpminusmax/scalar-p5.c.in
index db02ebc..0ff050f 100644
--- a/src/f32-raddstoreexpminusmax/scalar-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/scalar-p5.c.in
@@ -52,7 +52,7 @@
       // Compute reduced argument n := round(x / log(2)).
       // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
       // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
-      // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
+      // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
       // anyway. We fixup the result for such inputs at the very end of the algorithm.
       $for N in range(ELEMENTS_TILE):
         float vn${N} = vx${N} * vlog2e + vmagic_bias;
@@ -135,7 +135,7 @@
     // Compute reduced argument n := round(x / log(2)).
     // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
     // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
-    // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
     // anyway. We fixup the result for such inputs at the very end of the algorithm.
     float vn = vx * vlog2e + vmagic_bias;
 
diff --git a/src/math/exp-neonfma-rr2-lut64-p2.c b/src/math/exp-neonfma-rr2-lut64-p2.c
index ab8b42a..22eff23 100644
--- a/src/math/exp-neonfma-rr2-lut64-p2.c
+++ b/src/math/exp-neonfma-rr2-lut64-p2.c
@@ -46,7 +46,7 @@
     // Compute reduced argument n := round(x * 64 / log(2)).
     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
     // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
     // inputs outside of [-103.97207, 88.72283] underflow or overflow expf(x) anyway. We fixup the result for such
     // inputs at the very end of the algorithm.
     float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e_x64);