Run template code generators
PiperOrigin-RevId: 389691150
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c
index 4041920..f7c8dda 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc2.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
// Load 12 (3x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c
index fc3b9f4..00cdc52 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12-acc3.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
v128_t vacc2 = vacc0;
for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c
index cb750d0..c4e79c0 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x12.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
// Load 12 (3x4) inputs at a time.
const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c
index 5122d4e..a6d13e6 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc2.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
// Load 16 (4x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c
index 8ef5deb..3d8e1d0 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16-acc4.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
v128_t vacc2 = vacc0;
v128_t vacc3 = vacc0;
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c
index a7d1a07..37ba150 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x16.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
// Load 16 (4x4) inputs at a time.
const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c
index e10f902..260d10d 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc2.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
// Load 20 (5x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c
index da422dc..5c4bd6e 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20-acc5.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
v128_t vacc2 = vacc0;
v128_t vacc3 = vacc0;
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c
index 0c04e0e..d287a1a 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x20.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
// Load 20 (5x4) inputs at a time.
const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
index 2a37aa0..5cf12be 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
// Load 4 (1x4) inputs at a time.
const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c
index 5f60d50..838d347 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8-acc2.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
v128_t vacc1 = vacc0;
for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
// Load 8 (2x4) inputs at a time.
diff --git a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
index a2e41ce..010be50 100644
--- a/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
@@ -40,7 +40,7 @@
const v128_t vi_max = wasm_f32x4_splat(max);
- v128_t vacc0 = wasm_f32x4_splat(0.0f);
+ v128_t vacc0 = wasm_f64x2_splat(0.0);
for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
// Load 8 (2x4) inputs at a time.
const v128_t vi0123 = wasm_v128_load(input);
diff --git a/src/f32-raddstoreexpminusmax/neon-p5.c.in b/src/f32-raddstoreexpminusmax/neon-p5.c.in
index 3bb228f..d2c20fe 100644
--- a/src/f32-raddstoreexpminusmax/neon-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/neon-p5.c.in
@@ -60,7 +60,7 @@
// Compute reduced argument n := round(x / log(2)).
// We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
// large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
- // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+ // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
// inputs outside of [-87.336540, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
// of the algorithm.
$for N in range(0, ELEMENTS_TILE, 4):
@@ -141,7 +141,7 @@
// Compute reduced argument n := round(x / log(2)).
// We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
// large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
- // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+ // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
// inputs outside of [-87.336540, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
// of the algorithm.
float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vx, vlog2e);
@@ -198,7 +198,7 @@
// Compute reduced argument n := round(x / log(2)).
// We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
// large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
- // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+ // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
// inputs outside of [-87.336540, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
// of the algorithm.
float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vx, vlog2e);
diff --git a/src/f32-raddstoreexpminusmax/scalar-p5.c.in b/src/f32-raddstoreexpminusmax/scalar-p5.c.in
index db02ebc..0ff050f 100644
--- a/src/f32-raddstoreexpminusmax/scalar-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/scalar-p5.c.in
@@ -52,7 +52,7 @@
// Compute reduced argument n := round(x / log(2)).
// We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
// to an integer, then subtracing the large number back. The trick with adding large number is valid only within
- // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
+ // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
// anyway. We fixup the result for such inputs at the very end of the algorithm.
$for N in range(ELEMENTS_TILE):
float vn${N} = vx${N} * vlog2e + vmagic_bias;
@@ -135,7 +135,7 @@
// Compute reduced argument n := round(x / log(2)).
// We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
// to an integer, then subtracing the large number back. The trick with adding large number is valid only within
- // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
+ // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
// anyway. We fixup the result for such inputs at the very end of the algorithm.
float vn = vx * vlog2e + vmagic_bias;
diff --git a/src/math/exp-neonfma-rr2-lut64-p2.c b/src/math/exp-neonfma-rr2-lut64-p2.c
index ab8b42a..22eff23 100644
--- a/src/math/exp-neonfma-rr2-lut64-p2.c
+++ b/src/math/exp-neonfma-rr2-lut64-p2.c
@@ -46,7 +46,7 @@
// Compute reduced argument n := round(x * 64 / log(2)).
// We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
// large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
- // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+ // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
// inputs outside of [-103.97207, 88.72283] underflow or overflow expf(x) anyway. We fixup the result for such
// inputs at the very end of the algorithm.
float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e_x64);