Disable tsan for micro-kernels which read out-of-bounds

Avoid spurious data race reports caused by out-of-bounds reads overlapping with
writes by another thread

PiperOrigin-RevId: 313691532
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc2.c
index fc9fdf5..1b50e0f 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc3.c
index 7ebb3e5..6040f5c 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12-acc3.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12.c
index 0a7536f..11741ee 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x12.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc2.c
index 116ab07..389c09a 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc4.c
index 591e20d..6b5e45b 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16-acc4.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16.c
index 7974e1d..7d1d71f 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x16.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc2.c
index 673370b..5203c73 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc5.c
index 0200a06..debb44e 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20-acc5.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20.c
index 3199ade..97980b5 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x20.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x4.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x4.c
index 630490f..1e15bc9 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x4.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8-acc2.c
index 9ffc6dc..1bd9e78 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8.c b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8.c
index 9a48d4d..c9c3e00 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-lut64-p2-x8.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc2.c
index 2817b0e..8cb5929 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc3.c
index 9a8b9ac..2272365 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x12-acc3.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x12.c
index e3a33ac..0fc17fe 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x12.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc2.c
index 4026bef..12b541c 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc4.c
index 7af363a..1c0c74e 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x16-acc4.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x16.c
index 22ce848..18d82e3 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x16.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc2.c
index dce5edc..04cfc63 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc5.c
index af6bc25..02ff6bb 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x20-acc5.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x20.c
index 4984203..4dc2e87 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x20.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x4.c
index 3bc7511..9b16878 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x4.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x8-acc2.c
index 627f4b3..92bdf71 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x8-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neon-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/neon-p5-x8.c
index 7b6f1b0..ca899b6 100644
--- a/src/f32-raddstoreexpminusmax/gen/neon-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/neon-p5-x8.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc2.c
index 9542df6..be590e5 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc3.c
index 57564d0..2cd2c6c 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12-acc3.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12.c
index 6292965..9221edb 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x12.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc2.c
index 6b61ee2..8c63513 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc4.c
index 4c48f8d..b49c855 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16-acc4.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16.c
index d18d499..6b4ee6b 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x16.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc2.c
index 7b69b09..fc020e0 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc5.c
index 4756fa9..56d3596 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20-acc5.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20.c
index 502e81d..41a4600 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x20.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x4.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x4.c
index cc85f20..d2441a7 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x4.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8-acc2.c
index 80d4b5d..1c69b68 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8-acc2.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8.c b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8.c
index 9e986ca..a2a52f9 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-lut64-p2-x8.c
@@ -22,7 +22,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc2.c
index e03f9c6..a0731d9 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc3.c
index a8faf3b..4991f02 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12-acc3.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12.c
index 6867884..5179e56 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x12.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc2.c
index fe97342..449e937 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc4.c
index 783cfca..3124553 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16-acc4.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16.c
index 49ee72f..e30af20 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x16.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc2.c
index 43bd94c..838fabe 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc5.c
index 4134ebb..5b1aaa0 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20-acc5.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20.c
index 73da24d..f4800eb 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x20.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c
index 58d03b6..0d6be03 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8-acc2.c
index 1e1be00..05b0cf6 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8.c
index f69dff6..e356be8 100644
--- a/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
index 8ff55cf..50edd00 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
index 89f7a49..ede5c6c 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
index b517915..e8e2df9 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
index 38db010..4df8338 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
index 12882c1..8ac5fb8 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
index 8329c65..7751927 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
index 939c2a5..4231d00 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
index c037620..fc7327b 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
index 5b80fa0..259375c 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
index 777837d..f6e5c1c 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
index 2132ed3..c7958fa 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
index 06a6a75..e803c5e 100644
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
index 5ae3399..66904e2 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -182,7 +181,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
index 785b0c7..2dc01f4 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -184,7 +183,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
index 40c6ce9..85df068 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -179,7 +178,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
index cfb462b..93ab412 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -198,7 +197,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
index 346dea1..11f8d41 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -202,7 +201,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
index 53cd00e..8b38fc4 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -195,7 +194,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
index 4e9bf2b..67d3d52 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -214,7 +213,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
index 7c7445e..4e78535 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -220,7 +219,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
index ca0fde9..6ea49ea 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -211,7 +210,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
index 172fdd3..f60dcbc 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -147,7 +146,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
index 05d2aa8..d09f3cc 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -166,7 +165,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
index 751020f..2dc1589 100644
--- a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -163,7 +162,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);
diff --git a/src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in b/src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in
index b7a2674..5071ab7 100644
--- a/src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in
+++ b/src/f32-raddstoreexpminusmax/neon-lut64-p2.c.in
@@ -23,7 +23,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/neon-p5.c.in b/src/f32-raddstoreexpminusmax/neon-p5.c.in
index 2bceff0..81d75c2 100644
--- a/src/f32-raddstoreexpminusmax/neon-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/neon-p5.c.in
@@ -21,7 +21,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/psimd-p5.c.in b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
index 7cba991..d7fade0 100644
--- a/src/f32-raddstoreexpminusmax/psimd-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
@@ -20,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
diff --git a/src/f32-raddstoreexpminusmax/sse2-p5.c.in b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
index 6582df8..c7f73ca 100644
--- a/src/f32-raddstoreexpminusmax/sse2-p5.c.in
+++ b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
@@ -12,7 +12,6 @@
 #include <emmintrin.h>
 
 #include <xnnpack/common.h>
-#include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/raddstoreexpminusmax.h>
 
 
@@ -21,7 +20,7 @@
     const float* input,
     float* output,
     float* sum,
-    float max)
+    float max) XNN_DISABLE_TSAN
 {
   assert(elements % sizeof(float) == 0);
 
@@ -174,7 +173,7 @@
     assert(elements >= 1 * sizeof(float));
     assert(elements <= 3 * sizeof(float));
     // Load 4 inputs at a time.
-    const __m128 vi = _mm_loadu_ps_notsan(input);
+    const __m128 vi = _mm_loadu_ps(input);
 
     // Subtract maximum input x := i - i_max. This implies x <= 0.
     const __m128 vx = _mm_sub_ps(vi, vi_max);